Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/X86')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 616
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h | 36
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp | 33
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 15
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h | 1
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 969
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 107
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp | 195
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h | 15
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 696
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 25
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 13
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp (renamed from contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp) | 37
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h (renamed from contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h) | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86.h | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86.td | 57
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp | 53
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 47
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp | 97
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp | 15
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp | 34
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp | 30
-rwxr-xr-x  contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp | 89
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp | 269
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 155
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp | 438
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h | 68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 1217
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp | 10041
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h | 1724
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp | 37
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp | 102
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp | 151
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td | 119
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td | 2614
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td | 179
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td | 157
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td | 70
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td | 97
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp | 206
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td | 47
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 199
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp | 1102
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h | 72
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td | 194
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td | 76
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td | 444
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td | 104
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td | 73
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td | 22
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp | 159
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp | 76
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp | 980
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h | 52
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp | 1
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp | 490
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp | 38
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td | 17
-rwxr-xr-x  contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td | 57
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td | 87
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td | 65
-rwxr-xr-x  contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td | 336
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td | 17
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td | 43
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td | 82
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td | 96
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp | 51
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp | 181
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | 443
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp | 17
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h | 49
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp | 53
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp | 28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 1766
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h | 119
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp | 1
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp | 99
125 files changed, 17919 insertions(+), 11135 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d37d812df485..a3014b2aba92 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -39,6 +40,11 @@
using namespace llvm;
+static cl::opt<bool> LVIInlineAsmHardening(
+ "x86-experimental-lvi-inline-asm-hardening",
+ cl::desc("Harden inline assembly code that may be vulnerable to Load Value"
+ " Injection (LVI). This feature is experimental."), cl::Hidden);
+
static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
@@ -74,7 +80,7 @@ class X86AsmParser : public MCTargetAsmParser {
enum VEXEncoding {
VEXEncoding_Default,
- VEXEncoding_VEX2,
+ VEXEncoding_VEX,
VEXEncoding_VEX3,
VEXEncoding_EVEX,
};
@@ -326,6 +332,7 @@ private:
IES_PLUS,
IES_MINUS,
IES_OFFSET,
+ IES_CAST,
IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
@@ -352,6 +359,7 @@ private:
bool MemExpr;
bool OffsetOperator;
SMLoc OffsetOperatorLoc;
+ StringRef CurType;
bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
if (Sym) {
@@ -379,6 +387,7 @@ private:
unsigned getScale() { return Scale; }
const MCExpr *getSym() { return Sym; }
StringRef getSymName() { return SymName; }
+ StringRef getType() { return CurType; }
int64_t getImm() { return Imm + IC.execute(); }
bool isValidEndState() {
return State == IES_RBRAC || State == IES_INTEGER;
@@ -611,9 +620,9 @@ private:
}
bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
const InlineAsmIdentifierInfo &IDInfo,
- bool ParsingInlineAsm, StringRef &ErrMsg) {
+ bool ParsingMSInlineAsm, StringRef &ErrMsg) {
// InlineAsm: Treat an enum value as an integer
- if (ParsingInlineAsm)
+ if (ParsingMSInlineAsm)
if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
return onInteger(IDInfo.Enum.EnumVal, ErrMsg);
// Treat a symbolic constant like an integer
@@ -624,6 +633,7 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_CAST:
case IES_PLUS:
case IES_MINUS:
case IES_NOT:
@@ -634,7 +644,7 @@ private:
MemExpr = true;
State = IES_INTEGER;
IC.pushOperand(IC_IMM);
- if (ParsingInlineAsm)
+ if (ParsingMSInlineAsm)
Info = IDInfo;
break;
}
@@ -736,6 +746,7 @@ private:
IC.pushOperator(IC_PLUS);
break;
case IES_INIT:
+ case IES_CAST:
assert(!BracCount && "BracCount should be zero on parsing's start");
State = IES_LBRAC;
break;
@@ -808,6 +819,7 @@ private:
case IES_INTEGER:
case IES_OFFSET:
case IES_REGISTER:
+ case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
IC.pushOperator(IC_RPAREN);
@@ -815,7 +827,7 @@ private:
}
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
- const InlineAsmIdentifierInfo &IDInfo, bool ParsingInlineAsm,
+ const InlineAsmIdentifierInfo &IDInfo, bool ParsingMSInlineAsm,
StringRef &ErrMsg) {
PrevState = State;
switch (State) {
@@ -833,13 +845,26 @@ private:
// As we cannot yet resolve the actual value (offset), we retain
// the requested semantics by pushing a '0' to the operands stack
IC.pushOperand(IC_IMM);
- if (ParsingInlineAsm) {
+ if (ParsingMSInlineAsm) {
Info = IDInfo;
}
break;
}
return false;
}
+ void onCast(StringRef Type) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_LPAREN:
+ setType(Type);
+ State = IES_CAST;
+ break;
+ }
+ }
+ void setType(StringRef Type) { CurType = Type; }
};
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
@@ -858,6 +883,11 @@ private:
return nullptr;
}
+ bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc,
+ SMLoc EndLoc);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
+
std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
bool IsSIReg(unsigned Reg);
@@ -896,10 +926,10 @@ private:
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
- CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
- unsigned IndexReg, unsigned Scale, SMLoc Start,
- SMLoc End, unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info);
+ CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
@@ -927,9 +957,14 @@ private:
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
- /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+ // Load Value Injection (LVI) Mitigations for machine code
+ void emitWarningForSpecialLVIInstruction(SMLoc Loc);
+ void applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out);
+ void applyLVILoadHardeningMitigation(MCInst &Inst, MCStreamer &Out);
+
+ /// Wrapper around MCStreamer::emitInstruction(). Possibly adds
/// instrumentation around Inst.
- void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+ void emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -1023,6 +1058,8 @@ public:
}
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
@@ -1129,36 +1166,21 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
return checkScale(Scale, ErrMsg);
}
-bool X86AsmParser::ParseRegister(unsigned &RegNo,
- SMLoc &StartLoc, SMLoc &EndLoc) {
- MCAsmParser &Parser = getParser();
- RegNo = 0;
- const AsmToken &PercentTok = Parser.getTok();
- StartLoc = PercentTok.getLoc();
-
+bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName,
+ SMLoc StartLoc, SMLoc EndLoc) {
// If we encounter a %, ignore it. This code handles registers with and
// without the prefix, unprefixed registers can occur in cfi directives.
- if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
- Parser.Lex(); // Eat percent token.
+ RegName.consume_front("%");
- const AsmToken &Tok = Parser.getTok();
- EndLoc = Tok.getEndLoc();
-
- if (Tok.isNot(AsmToken::Identifier)) {
- if (isParsingIntelSyntax()) return true;
- return Error(StartLoc, "invalid register name",
- SMRange(StartLoc, EndLoc));
- }
-
- RegNo = MatchRegisterName(Tok.getString());
+ RegNo = MatchRegisterName(RegName);
// If the match failed, try the register name as lowercase.
if (RegNo == 0)
- RegNo = MatchRegisterName(Tok.getString().lower());
+ RegNo = MatchRegisterName(RegName.lower());
// The "flags" and "mxcsr" registers cannot be referenced directly.
// Treat it as an identifier instead.
- if (isParsingInlineAsm() && isParsingIntelSyntax() &&
+ if (isParsingMSInlineAsm() && isParsingIntelSyntax() &&
(RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
RegNo = 0;
@@ -1172,27 +1194,137 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
X86II::isX86_64ExtendedReg(RegNo)) {
- StringRef RegName = Tok.getString();
- Parser.Lex(); // Eat register name.
return Error(StartLoc,
"register %" + RegName + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
}
}
+ // If this is "db[0-15]", match it as an alias
+ // for dr[0-15].
+ if (RegNo == 0 && RegName.startswith("db")) {
+ if (RegName.size() == 3) {
+ switch (RegName[2]) {
+ case '0':
+ RegNo = X86::DR0;
+ break;
+ case '1':
+ RegNo = X86::DR1;
+ break;
+ case '2':
+ RegNo = X86::DR2;
+ break;
+ case '3':
+ RegNo = X86::DR3;
+ break;
+ case '4':
+ RegNo = X86::DR4;
+ break;
+ case '5':
+ RegNo = X86::DR5;
+ break;
+ case '6':
+ RegNo = X86::DR6;
+ break;
+ case '7':
+ RegNo = X86::DR7;
+ break;
+ case '8':
+ RegNo = X86::DR8;
+ break;
+ case '9':
+ RegNo = X86::DR9;
+ break;
+ }
+ } else if (RegName.size() == 4 && RegName[2] == '1') {
+ switch (RegName[3]) {
+ case '0':
+ RegNo = X86::DR10;
+ break;
+ case '1':
+ RegNo = X86::DR11;
+ break;
+ case '2':
+ RegNo = X86::DR12;
+ break;
+ case '3':
+ RegNo = X86::DR13;
+ break;
+ case '4':
+ RegNo = X86::DR14;
+ break;
+ case '5':
+ RegNo = X86::DR15;
+ break;
+ }
+ }
+ }
+
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax())
+ return true;
+ return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc));
+ }
+ return false;
+}
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc, bool RestoreOnFailure) {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ RegNo = 0;
+
+ SmallVector<AsmToken, 5> Tokens;
+ auto OnFailure = [RestoreOnFailure, &Lexer, &Tokens]() {
+ if (RestoreOnFailure) {
+ while (!Tokens.empty()) {
+ Lexer.UnLex(Tokens.pop_back_val());
+ }
+ }
+ };
+
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) {
+ Tokens.push_back(PercentTok);
+ Parser.Lex(); // Eat percent token.
+ }
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ OnFailure();
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ if (MatchRegisterByName(RegNo, Tok.getString(), StartLoc, EndLoc)) {
+ OnFailure();
+ return true;
+ }
+
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
if (RegNo == X86::ST0) {
+ Tokens.push_back(Tok);
Parser.Lex(); // Eat 'st'
// Check to see if we have '(4)' after %st.
- if (getLexer().isNot(AsmToken::LParen))
+ if (Lexer.isNot(AsmToken::LParen))
return false;
// Lex the paren.
- getParser().Lex();
+ Tokens.push_back(Parser.getTok());
+ Parser.Lex();
const AsmToken &IntTok = Parser.getTok();
- if (IntTok.isNot(AsmToken::Integer))
+ if (IntTok.isNot(AsmToken::Integer)) {
+ OnFailure();
return Error(IntTok.getLoc(), "expected stack index");
+ }
switch (IntTok.getIntVal()) {
case 0: RegNo = X86::ST0; break;
case 1: RegNo = X86::ST1; break;
@@ -1202,11 +1334,18 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
case 5: RegNo = X86::ST5; break;
case 6: RegNo = X86::ST6; break;
case 7: RegNo = X86::ST7; break;
- default: return Error(IntTok.getLoc(), "invalid stack index");
+ default:
+ OnFailure();
+ return Error(IntTok.getLoc(), "invalid stack index");
}
- if (getParser().Lex().isNot(AsmToken::RParen))
+ // Lex IntTok
+ Tokens.push_back(IntTok);
+ Parser.Lex();
+ if (Lexer.isNot(AsmToken::RParen)) {
+ OnFailure();
return Error(Parser.getTok().getLoc(), "expected ')'");
+ }
EndLoc = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat ')'
@@ -1215,41 +1354,8 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
EndLoc = Parser.getTok().getEndLoc();
- // If this is "db[0-15]", match it as an alias
- // for dr[0-15].
- if (RegNo == 0 && Tok.getString().startswith("db")) {
- if (Tok.getString().size() == 3) {
- switch (Tok.getString()[2]) {
- case '0': RegNo = X86::DR0; break;
- case '1': RegNo = X86::DR1; break;
- case '2': RegNo = X86::DR2; break;
- case '3': RegNo = X86::DR3; break;
- case '4': RegNo = X86::DR4; break;
- case '5': RegNo = X86::DR5; break;
- case '6': RegNo = X86::DR6; break;
- case '7': RegNo = X86::DR7; break;
- case '8': RegNo = X86::DR8; break;
- case '9': RegNo = X86::DR9; break;
- }
- } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') {
- switch (Tok.getString()[3]) {
- case '0': RegNo = X86::DR10; break;
- case '1': RegNo = X86::DR11; break;
- case '2': RegNo = X86::DR12; break;
- case '3': RegNo = X86::DR13; break;
- case '4': RegNo = X86::DR14; break;
- case '5': RegNo = X86::DR15; break;
- }
- }
-
- if (RegNo != 0) {
- EndLoc = Parser.getTok().getEndLoc();
- Parser.Lex(); // Eat it.
- return false;
- }
- }
-
if (RegNo == 0) {
+ OnFailure();
if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
SMRange(StartLoc, EndLoc));
@@ -1259,6 +1365,25 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy X86AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
bool Parse32 = is32BitMode() || Code16GCC;
unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
@@ -1405,7 +1530,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
return ParseATTOperand();
}
-std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
const InlineAsmIdentifierInfo &Info) {
@@ -1445,8 +1570,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
} else {
BaseReg = BaseReg ? BaseReg : 1;
return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size, Identifier,
- Decl, FrontendSize);
+ IndexReg, Scale, Start, End, Size,
+ /*DefaultBaseReg=*/X86::RIP, Identifier, Decl,
+ FrontendSize);
}
}
@@ -1483,7 +1609,7 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
return true;
StringRef ErrMsg;
ParseError =
- SM.onOffset(Val, OffsetLoc, ID, Info, isParsingInlineAsm(), ErrMsg);
+ SM.onOffset(Val, OffsetLoc, ID, Info, isParsingMSInlineAsm(), ErrMsg);
if (ParseError)
return Error(SMLoc::getFromPointer(Name.data()), ErrMsg);
} else {
@@ -1525,12 +1651,51 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
- // Register
+ // (MASM only) <TYPE> PTR operator
+ if (Parser.isParsingMasm()) {
+ const AsmToken &NextTok = getLexer().peekTok();
+ if (NextTok.is(AsmToken::Identifier) &&
+ NextTok.getIdentifier().equals_lower("ptr")) {
+ SM.onCast(Identifier);
+ // Eat type and PTR.
+ consumeToken();
+ End = consumeToken();
+ break;
+ }
+ }
+ // Register, or (MASM only) <register>.<field>
unsigned Reg;
- if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) {
- if (SM.onRegister(Reg, ErrMsg))
- return Error(Tok.getLoc(), ErrMsg);
- break;
+ if (Tok.is(AsmToken::Identifier)) {
+ if (!ParseRegister(Reg, IdentLoc, End, /*RestoreOnFailure=*/true)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (Parser.isParsingMasm()) {
+ const std::pair<StringRef, StringRef> IDField =
+ Tok.getString().split('.');
+ const StringRef ID = IDField.first, Field = IDField.second;
+ SMLoc IDEndLoc = SMLoc::getFromPointer(ID.data() + ID.size());
+ if (!Field.empty() &&
+ !MatchRegisterByName(Reg, ID, IdentLoc, IDEndLoc)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+
+ StringRef Type;
+ unsigned Offset = 0;
+ SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
+ if (Parser.lookUpField(Field, Type, Offset))
+ return Error(FieldStartLoc, "unknown offset");
+ else if (SM.onPlus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ else if (SM.onInteger(Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ SM.setType(Type);
+
+ End = consumeToken();
+ break;
+ }
+ }
}
// Operator synonymous ("not", "or" etc.)
bool ParseError = false;
@@ -1542,37 +1707,40 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
// Symbol reference, when parsing assembly content
InlineAsmIdentifierInfo Info;
const MCExpr *Val;
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End)) {
- return Error(Tok.getLoc(), "Unexpected identifier!");
- } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
- return Error(IdentLoc, ErrMsg);
- } else
+ if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
+ // MS Dot Operator expression
+ if (Identifier.count('.') &&
+ (PrevTK == AsmToken::RBrac || PrevTK == AsmToken::RParen)) {
+ if (ParseIntelDotOperator(SM, End))
+ return true;
break;
+ }
}
- // MS InlineAsm operators (TYPE/LENGTH/SIZE)
- if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
- if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
- if (SM.onInteger(Val, ErrMsg))
- return Error(IdentLoc, ErrMsg);
- } else
- return true;
- break;
- }
- // MS Dot Operator expression
- if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
- if (ParseIntelDotOperator(SM, End))
+ if (isParsingMSInlineAsm()) {
+ // MS InlineAsm operators (TYPE/LENGTH/SIZE)
+ if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
+ if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else
+ return true;
+ break;
+ }
+ // MS InlineAsm identifier
+ // Call parseIdentifier() to combine @ with the identifier behind it.
+ if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+ return Error(IdentLoc, "expected identifier");
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
break;
}
- // MS InlineAsm identifier
- // Call parseIdentifier() to combine @ with the identifier behind it.
- if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
- return Error(IdentLoc, "expected identifier");
- if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
- return true;
- else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ if (getParser().parsePrimaryExpr(Val, End)) {
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
return Error(IdentLoc, ErrMsg);
+ }
break;
}
case AsmToken::Integer: {
@@ -1593,8 +1761,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
InlineAsmIdentifierInfo Info;
- if (SM.onIdentifierExpr(Val, Identifier, Info,
- isParsingInlineAsm(), ErrMsg))
+ if (SM.onIdentifierExpr(Val, Identifier, Info, isParsingMSInlineAsm(),
+ ErrMsg))
return Error(Loc, ErrMsg);
End = consumeToken();
} else {
@@ -1688,7 +1856,7 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(
const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) {
MCAsmParser &Parser = getParser();
- assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ assert(isParsingMSInlineAsm() && "Expected to be parsing inline assembly.");
Val = nullptr;
StringRef LineBuf(Identifier.data());
@@ -1777,9 +1945,11 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
}
/// Parse the '.' operator.
-bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) {
+bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
+ SMLoc &End) {
const AsmToken &Tok = getTok();
- unsigned Offset;
+ StringRef Type;
+ unsigned Offset = 0;
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
@@ -1791,10 +1961,15 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
Offset = DotDisp.getZExtValue();
- } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
- std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
- if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
- Offset))
+ } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
+ Tok.is(AsmToken::Identifier)) {
+ const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ const StringRef Base = BaseMember.first, Member = BaseMember.second;
+ if (getParser().lookUpField(SM.getType(), DotDispStr, Type, Offset) &&
+ getParser().lookUpField(SM.getSymName(), DotDispStr, Type, Offset) &&
+ getParser().lookUpField(DotDispStr, Type, Offset) &&
+ (!SemaCallback ||
+ SemaCallback->LookupInlineAsmField(Base, Member, Offset)))
return Error(Tok.getLoc(), "Unable to lookup field reference!");
} else
return Error(Tok.getLoc(), "Unexpected token type!");
@@ -1805,6 +1980,7 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
while (Tok.getLoc().getPointer() < DotExprEndLoc)
Lex();
SM.addImm(Offset);
+ SM.setType(Type);
return false;
}
@@ -1816,7 +1992,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
// Eat offset, mark start of identifier.
SMLoc Start = Lex().getLoc();
ID = getTok().getString();
- if (!isParsingInlineAsm()) {
+ if (!isParsingMSInlineAsm()) {
if ((getTok().isNot(AsmToken::Identifier) &&
getTok().isNot(AsmToken::String)) ||
getParser().parsePrimaryExpr(Val, End))
@@ -1939,7 +2115,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
if (ParseIntelExpression(SM, End))
return nullptr;
- if (isParsingInlineAsm())
+ if (isParsingMSInlineAsm())
RewriteIntelExpression(SM, Start, Tok.getLoc());
int64_t Imm = SM.getImm();
@@ -1953,7 +2129,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// RegNo != 0 specifies a valid segment register,
// and we are parsing a segment override
if (!SM.isMemExpr() && !RegNo) {
- if (isParsingInlineAsm() && SM.isOffsetOperator()) {
+ if (isParsingMSInlineAsm() && SM.isOffsetOperator()) {
const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo();
if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
// Disp includes the address of a variable; make sure this is recorded
@@ -2005,10 +2181,18 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
return ErrorOperand(Start, ErrMsg);
- if (isParsingInlineAsm())
- return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
- Scale, Start, End, Size, SM.getSymName(),
- SM.getIdentifierInfo());
+ if (isParsingMSInlineAsm())
+ return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(),
+ SM.getIdentifierInfo());
+
+ // When parsing x64 MS-style assembly, all memory operands default to
+ // RIP-relative when interpreted as non-absolute references.
+ if (Parser.isParsingMasm() && is64BitMode())
+ return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size,
+ /*DefaultBaseReg=*/X86::RIP);
+
if (!(BaseReg || IndexReg || RegNo))
return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
@@ -2420,8 +2604,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Parser.getTok().getLoc(), "Expected '}'");
Parser.Lex(); // Eat curly.
- if (Prefix == "vex2")
- ForcedVEXEncoding = VEXEncoding_VEX2;
+ if (Prefix == "vex" || Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX;
else if (Prefix == "vex3")
ForcedVEXEncoding = VEXEncoding_VEX3;
else if (Prefix == "evex")
@@ -2711,7 +2895,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// In MS inline asm curly braces mark the beginning/end of a block,
// therefore they should be interepreted as end of statement
CurlyAsEndOfStatement =
- isParsingIntelSyntax() && isParsingInlineAsm() &&
+ isParsingIntelSyntax() && isParsingMSInlineAsm() &&
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
return TokError("unexpected token in argument list");
@@ -3096,9 +3280,122 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
static const char *getSubtargetFeatureName(uint64_t Val);
-void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+void X86AsmParser::emitWarningForSpecialLVIInstruction(SMLoc Loc) {
+ Warning(Loc, "Instruction may be vulnerable to LVI and "
+ "requires manual mitigation");
+ Note(SMLoc(), "See https://software.intel.com/"
+ "security-software-guidance/insights/"
+ "deep-dive-load-value-injection#specialinstructions"
+ " for more information");
+}
+
+/// RET instructions and also instructions that indirect calls/jumps from memory
+/// combine a load and a branch within a single instruction. To mitigate these
+/// instructions against LVI, they must be decomposed into separate load and
+/// branch instructions, with an LFENCE in between. For more details, see:
+/// - X86LoadValueInjectionRetHardening.cpp
+/// - X86LoadValueInjectionIndirectThunks.cpp
+/// - https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Returns `true` if a mitigation was applied or warning was emitted.
+void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) {
+ // Information on control-flow instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+ switch (Inst.getOpcode()) {
+ case X86::RETW:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::RETIW: {
+ MCInst ShlInst, FenceInst;
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg =
+ is64BitMode() ? X86::RSP : (Parse32 ? X86::ESP : X86::SP);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0,
+ /*Scale=*/1, SMLoc{}, SMLoc{}, 0);
+ ShlInst.setOpcode(X86::SHL64mi);
+ ShlMemOp->addMemOperands(ShlInst, 5);
+ ShlInst.addOperand(MCOperand::createImm(0));
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(ShlInst, getSTI());
+ Out.emitInstruction(FenceInst, getSTI());
+ return;
+ }
+ case X86::JMP16m:
+ case X86::JMP32m:
+ case X86::JMP64m:
+ case X86::CALL16m:
+ case X86::CALL32m:
+ case X86::CALL64m:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+}
+
+/// To mitigate LVI, every instruction that performs a load can be followed by
+/// an LFENCE instruction to squash any potential mis-speculation. There are
+/// some instructions that require additional considerations, and may requre
+/// manual mitigation. For more details, see:
+/// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Returns `true` if a mitigation was applied or warning was emitted.
+void X86AsmParser::applyLVILoadHardeningMitigation(MCInst &Inst,
+ MCStreamer &Out) {
+ auto Opcode = Inst.getOpcode();
+ auto Flags = Inst.getFlags();
+ if ((Flags & X86::IP_HAS_REPEAT) || (Flags & X86::IP_HAS_REPEAT_NE)) {
+ // Information on REP string instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+ switch (Opcode) {
+ case X86::CMPSB:
+ case X86::CMPSW:
+ case X86::CMPSL:
+ case X86::CMPSQ:
+ case X86::SCASB:
+ case X86::SCASW:
+ case X86::SCASL:
+ case X86::SCASQ:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+ } else if (Opcode == X86::REP_PREFIX || Opcode == X86::REPNE_PREFIX) {
+ // If a REP instruction is found on its own line, it may or may not be
+ // followed by a vulnerable instruction. Emit a warning just in case.
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+
+ // Can't mitigate after terminators or calls. A control flow change may have
+ // already occurred.
+ if (MCID.isTerminator() || MCID.isCall())
+ return;
+
+ // LFENCE has the mayLoad property, don't double fence.
+ if (MCID.mayLoad() && Inst.getOpcode() != X86::LFENCE) {
+ MCInst FenceInst;
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(FenceInst, getSTI());
+ }
+}
+
+void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands,
MCStreamer &Out) {
- Out.EmitInstruction(Inst, getSTI());
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVIControlFlowIntegrity])
+ applyLVICFIMitigation(Inst, Out);
+
+ Out.emitInstruction(Inst, getSTI());
+
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVILoadHardening])
+ applyLVILoadHardeningMitigation(Inst, Out);
}
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -3133,7 +3430,7 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
}
}
@@ -3170,7 +3467,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
return Match_Unsupported;
- if ((ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ if ((ForcedVEXEncoding == VEXEncoding_VEX ||
ForcedVEXEncoding == VEXEncoding_VEX3) &&
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
@@ -3240,7 +3537,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
case Match_InvalidImmUnsignedi4: {
@@ -3282,20 +3579,47 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// Otherwise, we assume that this may be an integer instruction, which comes
// in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+ // MemSize corresponding to Suffixes. { 8, 16, 32, 64 } { 32, 64, 80, 0 }
+ const char *MemSize = Base[0] != 'f' ? "\x08\x10\x20\x40" : "\x20\x40\x50\0";
// Check for the various suffix matches.
uint64_t ErrorInfoIgnore;
FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
unsigned Match[4];
+ // Some instruction like VPMULDQ is NOT the variant of VPMULD but a new one.
+ // So we should make sure the suffix matcher only works for memory variant
+ // that has the same size with the suffix.
+ // FIXME: This flag is a workaround for legacy instructions that didn't
+ // declare non suffix variant assembly.
+ bool HasVectorReg = false;
+ X86Operand *MemOp = nullptr;
+ for (const auto &Op : Operands) {
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (X86Op->isVectorReg())
+ HasVectorReg = true;
+ else if (X86Op->isMem()) {
+ MemOp = X86Op;
+ assert(MemOp->Mem.Size == 0 && "Memory size always 0 under ATT syntax");
+ // Have we found an unqualified memory operand,
+ // break. IA allows only one memory operand.
+ break;
+ }
+ }
+
for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
Tmp.back() = Suffixes[I];
- Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MissingFeatures, MatchingInlineAsm,
- isParsingIntelSyntax());
- // If this returned as a missing feature failure, remember that.
- if (Match[I] == Match_MissingFeature)
- ErrorInfoMissingFeatures = MissingFeatures;
+ if (MemOp && HasVectorReg)
+ MemOp->Mem.Size = MemSize[I];
+ Match[I] = Match_MnemonicFail;
+ if (MemOp || !HasVectorReg) {
+ Match[I] =
+ MatchInstruction(Operands, Inst, ErrorInfoIgnore, MissingFeatures,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match[I] == Match_MissingFeature)
+ ErrorInfoMissingFeatures = MissingFeatures;
+ }
}
// Restore the old token.
@@ -3309,7 +3633,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
if (NumSuccessfulMatches == 1) {
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
}
@@ -3562,7 +3886,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
;
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
} else if (NumSuccessfulMatches > 1) {
@@ -3684,9 +4008,9 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
Section = getStreamer().getCurrentSectionOnly();
}
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(2, 0);
+ getStreamer().emitCodeAlignment(2, 0);
else
- getStreamer().EmitValueToAlignment(2, 0, 1, 0);
+ getStreamer().emitValueToAlignment(2, 0, 1, 0);
return false;
}
@@ -3699,7 +4023,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
Parser.Lex();
if (!is16BitMode()) {
SwitchMode(X86::Mode16Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code16gcc") {
// .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
@@ -3707,19 +4031,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
Code16GCC = true;
if (!is16BitMode()) {
SwitchMode(X86::Mode16Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code32") {
Parser.Lex();
if (!is32BitMode()) {
SwitchMode(X86::Mode32Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
}
} else if (IDVal == ".code64") {
Parser.Lex();
if (!is64BitMode()) {
SwitchMode(X86::Mode64Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code64);
}
} else {
Error(L, "unknown directive " + IDVal);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
index d831a63b04ee..5cf4516ede97 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -17,9 +17,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <memory>
@@ -60,6 +58,7 @@ struct X86Operand final : public MCParsedAsmOperand {
unsigned SegReg;
const MCExpr *Disp;
unsigned BaseReg;
+ unsigned DefaultBaseReg;
unsigned IndexReg;
unsigned Scale;
unsigned Size;
@@ -184,6 +183,10 @@ struct X86Operand final : public MCParsedAsmOperand {
assert(Kind == Memory && "Invalid access!");
return Mem.BaseReg;
}
+ unsigned getMemDefaultBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.DefaultBaseReg;
+ }
unsigned getMemIndexReg() const {
assert(Kind == Memory && "Invalid access!");
return Mem.IndexReg;
@@ -312,6 +315,11 @@ struct X86Operand final : public MCParsedAsmOperand {
bool isMem512() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 512);
}
+
+ bool isSibMem() const {
+ return isMem() && Mem.BaseReg != X86::RIP && Mem.BaseReg != X86::EIP;
+ }
+
bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
assert(Kind == Memory && "Invalid access!");
return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
@@ -458,6 +466,14 @@ struct X86Operand final : public MCParsedAsmOperand {
X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
}
+ bool isVectorReg() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::VR64RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg()));
+ }
+
bool isVK1Pair() const {
return Kind == Register &&
X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
@@ -540,7 +556,10 @@ struct X86Operand final : public MCParsedAsmOperand {
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ if (getMemBaseReg())
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ else
+ Inst.addOperand(MCOperand::createReg(getMemDefaultBaseReg()));
Inst.addOperand(MCOperand::createImm(getMemScale()));
Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
addExpr(Inst, getMemDisp());
@@ -633,6 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
+ Res->Mem.DefaultBaseReg = 0;
Res->Mem.IndexReg = 0;
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
@@ -648,11 +668,14 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
- SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr, unsigned FrontendSize = 0) {
+ SMLoc EndLoc, unsigned Size = 0,
+ unsigned DefaultBaseReg = X86::NoRegister,
+ StringRef SymName = StringRef(), void *OpDecl = nullptr,
+ unsigned FrontendSize = 0) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
- assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+ assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) &&
+ "Invalid memory operand!");
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
@@ -661,6 +684,7 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
+ Res->Mem.DefaultBaseReg = DefaultBaseReg;
Res->Mem.IndexReg = IndexReg;
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ea8c606d1564..a7fa1eb9a5ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -776,6 +776,10 @@ static int readModRM(struct InternalInstruction *insn) {
return prefix##_YMM0 + index; \
case TYPE_XMM: \
return prefix##_XMM0 + index; \
+ case TYPE_TMM: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_TMM0 + index; \
case TYPE_VK: \
index &= 0xf; \
if (index > 7) \
@@ -849,6 +853,7 @@ static int fixupReg(struct InternalInstruction *insn,
if (!valid)
return -1;
break;
+ case ENCODING_SIB:
CASE_ENCODING_RM:
if (insn->eaBase >= insn->eaRegBase) {
insn->eaBase = (EABase)fixupRMValue(
@@ -1533,6 +1538,15 @@ static int readOperands(struct InternalInstruction *insn) {
if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
break;
+ case ENCODING_SIB:
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
case ENCODING_REG:
CASE_ENCODING_RM:
if (readModRM(insn))
@@ -2006,9 +2020,11 @@ static bool translateRMRegister(MCInst &mcInst,
/// @param mcInst - The MCInst to append to.
/// @param insn - The instruction to extract Mod, R/M, and SIB fields
/// from.
+/// @param ForceSIB - The instruction must use SIB.
/// @return - 0 on success; nonzero otherwise
static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
- const MCDisassembler *Dis) {
+ const MCDisassembler *Dis,
+ bool ForceSIB = false) {
// Addresses in an MCInst are represented as five operands:
// 1. basereg (register) The R/M base, or (if there is a SIB) the
// SIB base
@@ -2067,11 +2083,12 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
// -Any base register used other than ESP/RSP/R12D/R12. Using these as a
// base always requires a SIB byte.
// -A scale other than 1 is used.
- if (insn.sibScale != 1 ||
- (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
- (insn.sibBase != SIB_BASE_NONE &&
- insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
- insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12)) {
+ if (!ForceSIB &&
+ (insn.sibScale != 1 ||
+ (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
+ (insn.sibBase != SIB_BASE_NONE &&
+ insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
+ insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12))) {
indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
X86::RIZ);
} else
@@ -2182,6 +2199,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM:
case TYPE_YMM:
case TYPE_ZMM:
+ case TYPE_TMM:
case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
@@ -2193,6 +2211,8 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_MVSIBY:
case TYPE_MVSIBZ:
return translateRMMemory(mcInst, insn, Dis);
+ case TYPE_MSIB:
+ return translateRMMemory(mcInst, insn, Dis, true);
}
}
@@ -2242,6 +2262,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
return false;
case ENCODING_WRITEMASK:
return translateMaskRegister(mcInst, insn.writemask);
+ case ENCODING_SIB:
CASE_ENCODING_RM:
CASE_ENCODING_VSIB:
return translateRM(mcInst, operand, insn, Dis);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 147fe46d81b9..4318c17f03a0 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -19,9 +19,6 @@
#include "llvm/Support/X86DisassemblerDecoderCommon.h"
namespace llvm {
-
-class MCInstrInfo;
-
namespace X86Disassembler {
// Accessor functions for various fields of an Intel instruction
@@ -383,6 +380,17 @@ namespace X86Disassembler {
ENTRY(BND2) \
ENTRY(BND3)
+#undef REGS_TMM
+#define REGS_TMM \
+ ENTRY(TMM0) \
+ ENTRY(TMM1) \
+ ENTRY(TMM2) \
+ ENTRY(TMM3) \
+ ENTRY(TMM4) \
+ ENTRY(TMM5) \
+ ENTRY(TMM6) \
+ ENTRY(TMM7)
+
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
@@ -407,6 +415,7 @@ namespace X86Disassembler {
REGS_DEBUG \
REGS_CONTROL \
REGS_BOUND \
+ REGS_TMM \
ENTRY(RIP)
/// All possible values of the base field for effective-address
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
index 5833017037a5..56738e9cfa73 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/ImmutableGraph.h
@@ -28,7 +28,6 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <iterator>
#include <utility>
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 675a9c377b12..0134b4efce72 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -56,7 +56,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (MI->getOpcode() == X86::CALLpcrel32 &&
(STI.getFeatureBits()[X86::Mode64Bit])) {
OS << "\tcallq\t";
- printPCRelImm(MI, 0, OS);
+ printPCRelImm(MI, Address, 0, OS);
}
// data16 and data32 both have the same encoding of 0x66. While data32 is
// valid only in 16 bit systems, data16 is valid in the rest.
@@ -68,8 +68,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
OS << "\tdata32";
}
// Try to print any aliases first.
- else if (!printAliasInstr(MI, OS) &&
- !printVecCompareInstr(MI, OS))
+ else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
printInstruction(MI, Address, OS);
// Next always print the annotation.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 3d5d384dc4a0..51ddae61d251 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -30,9 +30,10 @@ public:
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
@@ -46,13 +47,6 @@ public:
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 2284cd7a70b8..bf3b6bcb5463 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -12,7 +12,9 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -60,10 +62,9 @@ public:
else if (BranchType == "indirect")
addKind(X86::AlignBranchIndirect);
else {
- report_fatal_error(
- "'-x86-align-branch 'The branches's type is combination of jcc, "
- "fused, jmp, call, ret, indirect.(plus separated)",
- false);
+ errs() << "invalid argument " << BranchType.str()
+ << " to -x86-align-branch=; each element must be one of: fused, "
+ "jcc, jmp, call, ret, indirect.(plus separated)\n";
}
}
}
@@ -86,12 +87,13 @@ cl::opt<unsigned> X86AlignBranchBoundary(
cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch(
"x86-align-branch",
cl::desc(
- "Specify types of branches to align. The branches's types are "
- "combination of jcc, fused, jmp, call, ret, indirect. jcc indicates "
- "conditional jumps, fused indicates fused conditional jumps, jmp "
- "indicates unconditional jumps, call indicates direct and indirect "
- "calls, ret indicates rets, indirect indicates indirect jumps."),
- cl::value_desc("(plus separated list of types)"),
+ "Specify types of branches to align (plus separated list of types):"
+ "\njcc indicates conditional jumps"
+ "\nfused indicates fused conditional jumps"
+ "\njmp indicates direct unconditional jumps"
+ "\ncall indicates direct and indirect calls"
+ "\nret indicates rets"
+ "\nindirect indicates indirect unconditional jumps"),
cl::location(X86AlignBranchKindLoc));
cl::opt<bool> X86AlignBranchWithin32BBoundaries(
@@ -102,6 +104,18 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries(
"assumptions about labels corresponding to particular instructions, "
"and should be used with caution."));
+cl::opt<unsigned> X86PadMaxPrefixSize(
+ "x86-pad-max-prefix-size", cl::init(0),
+ cl::desc("Maximum number of prefixes to use for padding"));
+
+cl::opt<bool> X86PadForAlign(
+ "x86-pad-for-align", cl::init(true), cl::Hidden,
+ cl::desc("Pad previous instructions to implement align directives"));
+
+cl::opt<bool> X86PadForBranchAlign(
+ "x86-pad-for-branch-align", cl::init(true), cl::Hidden,
+ cl::desc("Pad previous instructions to implement branch alignment"));
+
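Note on the new knobs above: they are plain cl::opts, so they can be exercised directly from a driver that reaches this backend (llc is assumed here; foo.ll is a placeholder input):

    llc -x86-align-branch=fused+jcc+jmp -x86-pad-max-prefix-size=5 foo.ll

This requests alignment for fused pairs, conditional jumps and direct unconditional jumps, and allows prefix padding of at most five prefix bytes per instruction (prefixes the instruction already carries count toward the limit, per the TargetPrefixMax logic later in this file). An unknown element in the plus-separated list is now reported via errs() rather than aborting with report_fatal_error.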
class X86ELFObjectWriter : public MCELFObjectTargetWriter {
public:
X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
@@ -114,14 +128,18 @@ class X86AsmBackend : public MCAsmBackend {
std::unique_ptr<const MCInstrInfo> MCII;
X86AlignBranchKind AlignBranchType;
Align AlignBoundary;
+ unsigned TargetPrefixMax = 0;
- bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
-
- bool needAlign(MCObjectStreamer &OS) const;
- bool needAlignInst(const MCInst &Inst) const;
- MCBoundaryAlignFragment *
- getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const;
MCInst PrevInst;
+ MCBoundaryAlignFragment *PendingBA = nullptr;
+ std::pair<MCFragment *, size_t> PrevInstPosition;
+ bool CanPadInst;
+
+ uint8_t determinePaddingPrefix(const MCInst &Inst) const;
+ bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
+ bool needAlign(const MCInst &Inst) const;
+ bool canPadBranches(MCObjectStreamer &OS) const;
+ bool canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const;
public:
X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
@@ -142,11 +160,14 @@ public:
AlignBoundary = assumeAligned(X86AlignBranchBoundary);
if (X86AlignBranch.getNumOccurrences())
AlignBranchType = X86AlignBranchKindLoc;
+ if (X86PadMaxPrefixSize.getNumOccurrences())
+ TargetPrefixMax = X86PadMaxPrefixSize;
}
bool allowAutoPadding() const override;
- void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
- void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
+ bool allowEnhancedRelaxation() const override;
+ void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
+ void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
@@ -155,7 +176,7 @@ public:
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
-
+
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
@@ -171,22 +192,34 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
-static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) {
unsigned Op = Inst.getOpcode();
switch (Op) {
default:
return Op;
case X86::JCC_1:
- return (is16BitMode) ? X86::JCC_2 : X86::JCC_4;
+ return (Is16BitMode) ? X86::JCC_2 : X86::JCC_4;
case X86::JMP_1:
- return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
+ return (Is16BitMode) ? X86::JMP_2 : X86::JMP_4;
}
}
@@ -275,11 +308,11 @@ static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
}
}
-static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) {
unsigned R = getRelaxedOpcodeArith(Inst);
if (R != Inst.getOpcode())
return R;
- return getRelaxedOpcodeBranch(Inst, is16BitMode);
+ return getRelaxedOpcodeBranch(Inst, Is16BitMode);
}
static X86::CondCode getCondFromBranch(const MCInst &MI,
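At the byte level, the 1-byte to 4-byte pcrel relaxation selected above looks like this (standard x86 encodings, shown purely as an illustration; cb/cd are the Intel-manual placeholders for rel8/rel32):

    75 cb               jne target    ; JCC_1, 2 bytes
    0f 85 cd cd cd cd   jne target    ; JCC_4, 6 bytes
    eb cb               jmp target    ; JMP_1, 2 bytes
    e9 cd cd cd cd      jmp target    ; JMP_4, 5 bytes

In 16-bit mode the rel16 forms JCC_2/JMP_2 are chosen instead, as the switch shows.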
@@ -316,6 +349,11 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
return (BaseReg == X86::RIP);
}
+/// Check if the instruction is a prefix.
+static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) {
+ return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags);
+}
+
/// Check if the instruction is valid as the first instruction in macro fusion.
static bool isFirstMacroFusibleInst(const MCInst &Inst,
const MCInstrInfo &MCII) {
@@ -327,6 +365,69 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst,
return FIK != X86::FirstMacroFusionInstKind::Invalid;
}
+/// X86 can reduce the number of NOP bytes by padding instructions with
+/// prefixes instead, which gives better performance in some cases. Here, we
+/// determine which prefix is the most suitable.
+///
+/// If the instruction has a segment override prefix, use the existing one.
+/// If the target is 64-bit, use CS.
+/// If the target is 32-bit,
+/// - If the instruction has a ESP/EBP base register, use SS.
+/// - Otherwise use DS.
+uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const {
+ assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) &&
+ "Prefixes can be added only in 32-bit or 64-bit mode.");
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand != -1)
+ MemoryOperand += X86II::getOperandBias(Desc);
+
+ unsigned SegmentReg = 0;
+ if (MemoryOperand >= 0) {
+ // Check for explicit segment override on memory operand.
+ SegmentReg = Inst.getOperand(MemoryOperand + X86::AddrSegmentReg).getReg();
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(2).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(2).getReg();
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(1).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ case X86II::RawFrmMemOffs: {
+ // Check segment override opcode prefix as needed.
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ }
+
+ if (SegmentReg != 0)
+ return X86::getSegmentOverridePrefixForReg(SegmentReg);
+
+ if (STI.hasFeature(X86::Mode64Bit))
+ return X86::CS_Encoding;
+
+ if (MemoryOperand >= 0) {
+ unsigned BaseRegNum = MemoryOperand + X86::AddrBaseReg;
+ unsigned BaseReg = Inst.getOperand(BaseRegNum).getReg();
+ if (BaseReg == X86::ESP || BaseReg == X86::EBP)
+ return X86::SS_Encoding;
+ }
+ return X86::DS_Encoding;
+}
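To make the prefix choice concrete, a small hand-written 32-bit example (not taken from this patch):

    8b 45 04          movl 4(%ebp), %eax    ; EBP base register, so SS (0x36) is chosen
    36 36 8b 45 04    movl 4(%ebp), %eax    ; same instruction padded with two SS override prefixes

The added prefixes are redundant because EBP-based addressing already defaults to the SS segment, so behavior is unchanged while the encoding grows by the requested number of bytes.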
+
/// Check if the two instructions will be macro-fused on the target cpu.
bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const {
const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode());
@@ -355,19 +456,122 @@ static bool hasVariantSymbol(const MCInst &MI) {
}
bool X86AsmBackend::allowAutoPadding() const {
- return (AlignBoundary != Align::None() &&
- AlignBranchType != X86::AlignBranchNone);
+ return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
+}
+
+bool X86AsmBackend::allowEnhancedRelaxation() const {
+ return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
+}
+
+/// X86 has certain instructions which enable interrupts exactly one
+/// instruction *after* the instruction which stores to SS. Return true if the
+/// given instruction has such an interrupt delay slot.
+static bool hasInterruptDelaySlot(const MCInst &Inst) {
+ switch (Inst.getOpcode()) {
+ case X86::POPSS16:
+ case X86::POPSS32:
+ case X86::STI:
+ return true;
+
+ case X86::MOV16sr:
+ case X86::MOV32sr:
+ case X86::MOV64sr:
+ case X86::MOV16sm:
+ if (Inst.getOperand(0).getReg() == X86::SS)
+ return true;
+ break;
+ }
+ return false;
+}
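The architectural rule being modeled here is the usual x86 one: a pop/mov into SS (and STI) inhibits interrupts until after the next instruction, so padding must not be inserted between such a pair. A hand-written illustration (not from this patch):

    movw %ax, %ss     ; interrupts are inhibited until after the following instruction
    movl %ecx, %esp   ; matching stack-pointer update; a nop inserted between the two
                      ; would consume the delay slot and let an interrupt see SS/ESP out of sync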
+
+/// Check if the instruction to be emitted is right after any data.
+static bool
+isRightAfterData(MCFragment *CurrentFragment,
+ const std::pair<MCFragment *, size_t> &PrevInstPosition) {
+ MCFragment *F = CurrentFragment;
+ // Empty data fragments may be created to prevent further data being
+  // added into the previous fragment; we need to skip them since they
+ // have no contents.
+ for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode())
+ if (cast<MCDataFragment>(F)->getContents().size() != 0)
+ break;
+
+ // Since data is always emitted into a DataFragment, our check strategy is
+ // simple here.
+ // - If the fragment is a DataFragment
+ // - If it's not the fragment where the previous instruction is,
+ // returns true.
+ // - If it's the fragment holding the previous instruction but its
+  //     size changed since the previous instruction was emitted into
+ // it, returns true.
+ // - Otherwise returns false.
+ // - If the fragment is not a DataFragment, returns false.
+ if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
+ return DF != PrevInstPosition.first ||
+ DF->getContents().size() != PrevInstPosition.second;
+
+ return false;
+}
+
+/// \returns the fragment size if it has instructions, otherwise returns 0.
+static size_t getSizeForInstFragment(const MCFragment *F) {
+ if (!F || !F->hasInstructions())
+ return 0;
+ // MCEncodedFragmentWithContents being templated makes this tricky.
+ switch (F->getKind()) {
+ default:
+ llvm_unreachable("Unknown fragment with instructions!");
+ case MCFragment::FT_Data:
+ return cast<MCDataFragment>(*F).getContents().size();
+ case MCFragment::FT_Relaxable:
+ return cast<MCRelaxableFragment>(*F).getContents().size();
+ case MCFragment::FT_CompactEncodedInst:
+ return cast<MCCompactEncodedInstFragment>(*F).getContents().size();
+ }
}
-bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
+/// Return true if we can insert NOP or prefixes automatically before the
+/// instruction to be emitted.
+bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
+ if (hasVariantSymbol(Inst))
+ // Linker may rewrite the instruction with variant symbol operand(e.g.
+ // TLSCALL).
+ return false;
+
+ if (hasInterruptDelaySlot(PrevInst))
+ // If this instruction follows an interrupt enabling instruction with a one
+ // instruction delay, inserting a nop would change behavior.
+ return false;
+
+ if (isPrefix(PrevInst, *MCII))
+ // If this instruction follows a prefix, inserting a nop/prefix would change
+    // the semantics.
+ return false;
+
+ if (isPrefix(Inst, *MCII))
+ // If this instruction is a prefix, inserting a prefix would change
+    // the semantics.
+ return false;
+
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
+ // If this instruction follows any data, there is no clear
+    // instruction boundary, so inserting a nop/prefix would change the semantics.
+ return false;
+
+ return true;
+}
+
+bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
if (!OS.getAllowAutoPadding())
return false;
assert(allowAutoPadding() && "incorrect initialization!");
- MCAssembler &Assembler = OS.getAssembler();
- MCSection *Sec = OS.getCurrentSectionOnly();
+  // We only pad in the text section.
+ if (!OS.getCurrentSectionOnly()->getKind().isText())
+ return false;
+
// To be Done: Currently don't deal with Bundle cases.
- if (Assembler.isBundlingEnabled() && Sec->isBundleLocked())
+ if (OS.getAssembler().isBundlingEnabled())
return false;
// Branches only need to be aligned in 32-bit or 64-bit mode.
@@ -377,59 +581,42 @@ bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
return true;
}
-/// Check if the instruction operand needs to be aligned. Padding is disabled
-/// before intruction which may be rewritten by linker(e.g. TLSCALL).
-bool X86AsmBackend::needAlignInst(const MCInst &Inst) const {
- // Linker may rewrite the instruction with variant symbol operand.
- if (hasVariantSymbol(Inst))
- return false;
-
- const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode());
- return (InstDesc.isConditionalBranch() &&
+/// Check if the instruction operand needs to be aligned.
+bool X86AsmBackend::needAlign(const MCInst &Inst) const {
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ return (Desc.isConditionalBranch() &&
(AlignBranchType & X86::AlignBranchJcc)) ||
- (InstDesc.isUnconditionalBranch() &&
+ (Desc.isUnconditionalBranch() &&
(AlignBranchType & X86::AlignBranchJmp)) ||
- (InstDesc.isCall() &&
- (AlignBranchType & X86::AlignBranchCall)) ||
- (InstDesc.isReturn() &&
- (AlignBranchType & X86::AlignBranchRet)) ||
- (InstDesc.isIndirectBranch() &&
+ (Desc.isCall() && (AlignBranchType & X86::AlignBranchCall)) ||
+ (Desc.isReturn() && (AlignBranchType & X86::AlignBranchRet)) ||
+ (Desc.isIndirectBranch() &&
(AlignBranchType & X86::AlignBranchIndirect));
}
-static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) {
- // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it.
- return !F.canEmitNops();
-}
+/// Insert a BoundaryAlignFragment before instructions to align branches.
+void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
+ const MCInst &Inst) {
+ CanPadInst = canPadInst(Inst, OS);
-MCBoundaryAlignFragment *
-X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const {
- auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment());
- if (!F || !canReuseBoundaryAlignFragment(*F)) {
- F = new MCBoundaryAlignFragment(AlignBoundary);
- OS.insert(F);
- }
- return F;
-}
+ if (!canPadBranches(OS))
+ return;
+
+ if (!isMacroFused(PrevInst, Inst))
+    // Macro fusion did not actually happen; clear the pending fragment.
+ PendingBA = nullptr;
-/// Insert MCBoundaryAlignFragment before instructions to align branches.
-void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
- const MCInst &Inst) {
- if (!needAlign(OS))
+ if (!CanPadInst)
return;
- MCFragment *CF = OS.getCurrentFragment();
- bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused;
- if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
+ if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) {
// Macro fusion actually happens and there is no other fragment inserted
- // after the previous instruction. NOP can be emitted in PF to align fused
- // jcc.
- if (auto *PF =
- dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) {
- const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true);
- const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true);
- }
- } else if (needAlignInst(Inst)) {
+ // after the previous instruction.
+ //
+    // Do nothing here since we already inserted a BoundaryAlign fragment when
+ // we met the first instruction in the fused pair and we'll tie them
+ // together in emitInstructionEnd.
+ //
// Note: When there is at least one fragment, such as MCAlignFragment,
// inserted after the previous instruction, e.g.
//
@@ -441,34 +628,41 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
//
     // We will treat the JCC as an unfused branch although it may be fused
// with the CMP.
- auto *F = getOrCreateBoundaryAlignFragment(OS);
- F->setEmitNops(true);
- F->setFused(false);
- } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) {
- // We don't know if macro fusion happens until the reaching the next
- // instruction, so a place holder is put here if necessary.
- getOrCreateBoundaryAlignFragment(OS);
+ return;
}
- PrevInst = Inst;
+ if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) &&
+ isFirstMacroFusibleInst(Inst, *MCII))) {
+    // If we meet an unfused branch or the first instruction in a fusible pair,
+ // insert a BoundaryAlign fragment.
+ OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary));
+ }
}
-/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned
-/// if necessary.
-void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
- if (!needAlign(OS))
+/// Set the last fragment to be aligned for the BoundaryAlignFragment.
+void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) {
+ PrevInst = Inst;
+ MCFragment *CF = OS.getCurrentFragment();
+ PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
+ F->setAllowAutoPadding(CanPadInst);
+
+ if (!canPadBranches(OS))
+ return;
+
+ if (!needAlign(Inst) || !PendingBA)
return;
- // If the branch is emitted into a MCRelaxableFragment, we can determine the
- // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the
- // branch is fused, the fused branch(macro fusion pair) must be emitted into
- // two fragments. Or when the branch is unfused, the branch must be emitted
- // into one fragment. The MCRelaxableFragment naturally marks the end of the
- // fused or unfused branch.
- // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of
- // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align
- // other branch.
- if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment()))
- OS.insert(new MCBoundaryAlignFragment(AlignBoundary));
+
+  // Tie the aligned instructions into the pending BoundaryAlign fragment.
+ PendingBA->setLastFragment(CF);
+ PendingBA = nullptr;
+
+ // We need to ensure that further data isn't added to the current
+ // DataFragment, so that we can get the size of instructions later in
+ // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
+ // DataFragment.
+ if (isa_and_nonnull<MCDataFragment>(CF))
+ OS.insert(new MCDataFragment());
// Update the maximum alignment on the current section if necessary.
MCSection *Sec = OS.getCurrentSectionOnly();
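A sketch of what this bookkeeping achieves (offsets invented for illustration, 32-byte boundary assumed):

    ; before relaxation: the fused cmp+jne pair crosses the 0x20 boundary
    0x1d:  48 39 d8    cmpq %rbx, %rax
    0x20:  75 xx       jne  target
    ; after the BoundaryAlignFragment is sized, three bytes of padding (nops, or
    ; prefixes on earlier instructions) make the pair start at the boundary so it
    ; no longer crosses it
    0x20:  48 39 d8    cmpq %rbx, %rax
    0x23:  75 xx       jne  target

emitInstructionBegin inserts the pending fragment before the first instruction of the pair, and the code above ties it to the fragment holding the last instruction so MCAssembler::relaxBoundaryAlign can size the padding.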
@@ -478,13 +672,23 @@ void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
if (STI.getTargetTriple().isOSBinFormatELF()) {
+ unsigned Type;
if (STI.getTargetTriple().getArch() == Triple::x86_64) {
- if (Name == "R_X86_64_NONE")
- return FK_NONE;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
+#undef ELF_RELOC
+ .Default(-1u);
} else {
- if (Name == "R_386_NONE")
- return FK_NONE;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/i386.def"
+#undef ELF_RELOC
+ .Default(-1u);
}
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
return MCAsmBackend::getFixupKind(Name);
}
@@ -502,6 +706,11 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
};
+ // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
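The .reloc handling changed by the two hunks above can be exercised from assembly; a hedged sketch (the relocation names come from the ELFRelocs .def files pulled in above):

    .reloc 0, R_X86_64_NONE, foo   # emits a literal R_X86_64_NONE against foo
    .reloc 0, R_X86_64_32, bar     # any type named in x86_64.def is now accepted

Each name is mapped to its ELF relocation number and wrapped as FirstLiteralRelocationKind + Type; such fixups are always forced to relocations (see the shouldForceRelocation hunk below) and skipped in applyFixup, so no instruction bytes are patched.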
@@ -514,7 +723,7 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
const MCFixup &Fixup,
const MCValue &) {
- return Fixup.getKind() == FK_NONE;
+ return Fixup.getKind() >= FirstLiteralRelocationKind;
}
static unsigned getFixupKindSize(unsigned Kind) {
@@ -556,7 +765,10 @@ void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const {
- unsigned Size = getFixupKindSize(Fixup.getKind());
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned Size = getFixupKindSize(Kind);
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -613,12 +825,11 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
// FIXME: Can tblgen help at all here to verify there aren't other instructions
// we can relax?
-void X86AsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void X86AsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
// The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
- bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
- unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode);
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode);
if (RelaxedOp == Inst.getOpcode()) {
SmallString<256> Tmp;
@@ -628,8 +839,232 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
report_fatal_error("unexpected instruction to relax: " + OS.str());
}
- Res = Inst;
- Res.setOpcode(RelaxedOp);
+ Inst.setOpcode(RelaxedOp);
+}
+
+/// Return true if this instruction has been fully relaxed into its most
+/// general available form.
+static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
+ auto &Inst = RF.getInst();
+ auto &STI = *RF.getSubtargetInfo();
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (!RF.getAllowAutoPadding())
+ return false;
+ // If the instruction isn't fully relaxed, shifting it around might require a
+  // larger value for one of the fixups than can be encoded. The outer loop
+ // will also catch this before moving to the next instruction, but we need to
+ // prevent padding this single instruction as well.
+ if (!isFullyRelaxed(RF))
+ return false;
+
+ const unsigned OldSize = RF.getContents().size();
+ if (OldSize == 15)
+ return false;
+
+ const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize);
+ const unsigned RemainingPrefixSize = [&]() -> unsigned {
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.emitPrefix(RF.getInst(), VecOS, STI);
+ assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
+
+ // TODO: It turns out we need a decent amount of plumbing for the target
+    // specific bits to determine the number of prefixes it is safe to add. Various
+ // targets (older chips mostly, but also Atom family) encounter decoder
+ // stalls with too many prefixes. For testing purposes, we set the value
+ // externally for the moment.
+ unsigned ExistingPrefixSize = Code.size();
+ if (TargetPrefixMax <= ExistingPrefixSize)
+ return 0;
+ return TargetPrefixMax - ExistingPrefixSize;
+ }();
+ const unsigned PrefixBytesToAdd =
+ std::min(MaxPossiblePad, RemainingPrefixSize);
+ if (PrefixBytesToAdd == 0)
+ return false;
+
+ const uint8_t Prefix = determinePaddingPrefix(RF.getInst());
+
+ SmallString<256> Code;
+ Code.append(PrefixBytesToAdd, Prefix);
+ Code.append(RF.getContents().begin(), RF.getContents().end());
+ RF.getContents() = Code;
+
+ // Adjust the fixups for the change in offsets
+ for (auto &F : RF.getFixups()) {
+ F.setOffset(F.getOffset() + PrefixBytesToAdd);
+ }
+
+ RemainingSize -= PrefixBytesToAdd;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (isFullyRelaxed(RF))
+ // TODO: There are lots of other tricks we could apply for increasing
+ // encoding size without impacting performance.
+ return false;
+
+ MCInst Relaxed = RF.getInst();
+ relaxInstruction(Relaxed, *RF.getSubtargetInfo());
+
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo());
+ const unsigned OldSize = RF.getContents().size();
+ const unsigned NewSize = Code.size();
+ assert(NewSize >= OldSize && "size decrease during relaxation?");
+ unsigned Delta = NewSize - OldSize;
+ if (Delta > RemainingSize)
+ return false;
+ RF.setInst(Relaxed);
+ RF.getContents() = Code;
+ RF.getFixups() = Fixups;
+ RemainingSize -= Delta;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ bool Changed = false;
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize);
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize);
+ return Changed;
+}
+
+void X86AsmBackend::finishLayout(MCAssembler const &Asm,
+ MCAsmLayout &Layout) const {
+ // See if we can further relax some instructions to cut down on the number of
+ // nop bytes required for code alignment. The actual win is in reducing
+ // instruction count, not number of bytes. Modern X86-64 can easily end up
+ // decode limited. It is often better to reduce the number of instructions
+ // (i.e. eliminate nops) even at the cost of increasing the size and
+ // complexity of others.
+ if (!X86PadForAlign && !X86PadForBranchAlign)
+ return;
+
+ DenseSet<MCFragment *> LabeledFragments;
+ for (const MCSymbol &S : Asm.symbols())
+ LabeledFragments.insert(S.getFragment(false));
+
+ for (MCSection &Sec : Asm) {
+ if (!Sec.getKind().isText())
+ continue;
+
+ SmallVector<MCRelaxableFragment *, 4> Relaxable;
+ for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+ MCFragment &F = *I;
+
+ if (LabeledFragments.count(&F))
+ Relaxable.clear();
+
+ if (F.getKind() == MCFragment::FT_Data ||
+ F.getKind() == MCFragment::FT_CompactEncodedInst)
+ // Skip and ignore
+ continue;
+
+ if (F.getKind() == MCFragment::FT_Relaxable) {
+ auto &RF = cast<MCRelaxableFragment>(*I);
+ Relaxable.push_back(&RF);
+ continue;
+ }
+
+ auto canHandle = [](MCFragment &F) -> bool {
+ switch (F.getKind()) {
+ default:
+ return false;
+ case MCFragment::FT_Align:
+ return X86PadForAlign;
+ case MCFragment::FT_BoundaryAlign:
+ return X86PadForBranchAlign;
+ }
+ };
+ // For any unhandled kind, assume we can't change layout.
+ if (!canHandle(F)) {
+ Relaxable.clear();
+ continue;
+ }
+
+#ifndef NDEBUG
+ const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+#endif
+ const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F);
+
+ // To keep the effects local, prefer to relax instructions closest to
+ // the align directive. This is purely about human understandability
+ // of the resulting code. If we later find a reason to expand
+ // particular instructions over others, we can adjust.
+ MCFragment *FirstChangedFragment = nullptr;
+ unsigned RemainingSize = OrigSize;
+ while (!Relaxable.empty() && RemainingSize != 0) {
+ auto &RF = *Relaxable.pop_back_val();
+ // Give the backend a chance to play any tricks it wishes to increase
+ // the encoding size of the given instruction. Target independent code
+        // will try further relaxation, but targets may play further tricks.
+ if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+ FirstChangedFragment = &RF;
+
+ // If we have an instruction which hasn't been fully relaxed, we can't
+ // skip past it and insert bytes before it. Changing its starting
+ // offset might require a larger negative offset than it can encode.
+ // We don't need to worry about larger positive offsets as none of the
+ // possible offsets between this and our align are visible, and the
+ // ones afterwards aren't changing.
+ if (!isFullyRelaxed(RF))
+ break;
+ }
+ Relaxable.clear();
+
+ if (FirstChangedFragment) {
+        // Make sure the offsets for any fragments in the affected range get
+ // updated. Note that this (conservatively) invalidates the offsets of
+ // those following, but this is not required.
+ Layout.invalidateFragmentsFrom(FirstChangedFragment);
+ }
+
+      // BoundaryAlign explicitly tracks its size (unlike align)
+ if (F.getKind() == MCFragment::FT_BoundaryAlign)
+ cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+#ifndef NDEBUG
+ const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+ const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F);
+ assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+ "can't move start of next fragment!");
+ assert(FinalSize == RemainingSize && "inconsistent size computation?");
+#endif
+
+ // If we're looking at a boundary align, make sure we don't try to pad
+ // its target instructions for some following directive. Doing so would
+ // break the alignment of the current boundary align.
+ if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) {
+ const MCFragment *LastFragment = BF->getLastFragment();
+ if (!LastFragment)
+ continue;
+ while (&*I != LastFragment)
+ ++I;
+ }
+ }
+ }
+
+ // The layout is done. Mark every fragment as valid.
+ for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+ MCSection &Section = *Layout.getSectionOrder()[i];
+ Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
+ Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+ }
}
/// Write a sequence of optimal nops to the output, covering \p Count
@@ -661,7 +1096,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// This CPU doesn't support long nops. If needed add more.
// FIXME: We could generated something better than plain 0x90.
- if (!STI.getFeatureBits()[X86::FeatureNOPL]) {
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) {
for (uint64_t i = 0; i < Count; ++i)
OS << '\x90';
return true;
@@ -670,7 +1105,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// 15-bytes is the longest single NOP instruction, but 10-bytes is
// commonly the longest that can be efficiently decoded.
uint64_t MaxNopLength = 10;
- if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
MaxNopLength = 7;
else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
MaxNopLength = 15;
@@ -811,6 +1246,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
enum { CU_NUM_SAVED_REGS = 6 };
mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ Triple TT;
bool Is64Bit;
unsigned OffsetSize; ///< Offset of a "push" instruction.
@@ -838,10 +1274,140 @@ protected:
return 1;
}
+private:
+ /// Get the compact unwind number for a given register. The number
+ /// corresponds to the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const MCPhysReg CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const MCPhysReg CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// Return the registers encoded for a compact encoding with a frame
+ /// pointer.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of registers to be saved and an array of the registers
+ /// saved.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i < RegCount; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
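Because the renumbering comment above is easy to misread, here is a mechanical trace of encodeCompactUnwindRegistersWithoutFrame, assuming SavedRegs initially holds the CU register numbers {6, 2, 4, 5, 0, 0} with RegCount == 4 (a hypothetical input, not taken from this patch):

    after std::reverse:       {0, 0, 5, 4, 2, 6}
    RenumRegs[2..5]:          5-0-1 = 4, 4-0-1 = 3, 2-0-1 = 1, 6-3-1 = 2
    encoding for RegCount 4:  60*4 + 12*3 + 3*1 + 2 = 281, which fits in 10 bits

This only traces the code as written; it is not a statement about the compact unwind format beyond what the function computes.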
+
+public:
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), MRI(MRI), TT(STI.getTargetTriple()),
+ Is64Bit(TT.isArch64Bit()) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ }
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ uint32_t CPUType = cantFail(MachO::getCPUType(TT));
+ uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TT));
+ return createX86MachObjectWriter(Is64Bit, CPUType, CPUSubType);
+ }
+
/// Implementation of algorithm to generate the compact unwind encoding
/// for the CFI instructions.
uint32_t
- generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty()) return 0;
// Reset the saved registers.
@@ -904,7 +1470,7 @@ protected:
// L0:
// .cfi_def_cfa_offset 80
//
- StackSize = std::abs(Inst.getOffset()) / StackDivide;
+ StackSize = Inst.getOffset() / StackDivide;
++NumDefCFAOffsets;
break;
}
@@ -991,168 +1557,6 @@ protected:
return CompactUnwindEncoding;
}
-
-private:
- /// Get the compact unwind number for a given register. The number
- /// corresponds to the enum lists in compact_unwind_encoding.h.
- int getCompactUnwindRegNum(unsigned Reg) const {
- static const MCPhysReg CU32BitRegs[7] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const MCPhysReg CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
- for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
- if (*CURegs == Reg)
- return Idx;
-
- return -1;
- }
-
- /// Return the registers encoded for a compact encoding with a frame
- /// pointer.
- uint32_t encodeCompactUnwindRegistersWithFrame() const {
- // Encode the registers in the order they were saved --- 3-bits per
- // register. The list of saved registers is assumed to be in reverse
- // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
- uint32_t RegEnc = 0;
- for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
- unsigned Reg = SavedRegs[i];
- if (Reg == 0) break;
-
- int CURegNum = getCompactUnwindRegNum(Reg);
- if (CURegNum == -1) return ~0U;
-
- // Encode the 3-bit register number in order, skipping over 3-bits for
- // each register.
- RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
- }
-
- assert((RegEnc & 0x3FFFF) == RegEnc &&
- "Invalid compact register encoding!");
- return RegEnc;
- }
-
- /// Create the permutation encoding used with frameless stacks. It is
- /// passed the number of registers to be saved and an array of the registers
- /// saved.
- uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
- // The saved registers are numbered from 1 to 6. In order to encode the
- // order in which they were saved, we re-number them according to their
- // place in the register order. The re-numbering is relative to the last
- // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
- // that order:
- //
- // Orig Re-Num
- // ---- ------
- // 6 6
- // 2 2
- // 4 3
- // 5 3
- //
- for (unsigned i = 0; i < RegCount; ++i) {
- int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
- if (CUReg == -1) return ~0U;
- SavedRegs[i] = CUReg;
- }
-
- // Reverse the list.
- std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
-
- uint32_t RenumRegs[CU_NUM_SAVED_REGS];
- for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
- unsigned Countless = 0;
- for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
- if (SavedRegs[j] < SavedRegs[i])
- ++Countless;
-
- RenumRegs[i] = SavedRegs[i] - Countless - 1;
- }
-
- // Take the renumbered values and encode them into a 10-bit number.
- uint32_t permutationEncoding = 0;
- switch (RegCount) {
- case 6:
- permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
- + 6 * RenumRegs[2] + 2 * RenumRegs[3]
- + RenumRegs[4];
- break;
- case 5:
- permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
- + 6 * RenumRegs[3] + 2 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 4:
- permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
- + 3 * RenumRegs[4] + RenumRegs[5];
- break;
- case 3:
- permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 2:
- permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
- break;
- case 1:
- permutationEncoding |= RenumRegs[5];
- break;
- }
-
- assert((permutationEncoding & 0x3FF) == permutationEncoding &&
- "Invalid compact register encoding!");
- return permutationEncoding;
- }
-
-public:
- DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI, bool Is64Bit)
- : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) {
- memset(SavedRegs, 0, sizeof(SavedRegs));
- OffsetSize = Is64Bit ? 8 : 4;
- MoveInstrSize = Is64Bit ? 3 : 2;
- StackDivide = Is64Bit ? 8 : 4;
- }
-};
-
-class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
-public:
- DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI)
- : DarwinX86AsmBackend(T, MRI, STI, false) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- return createX86MachObjectWriter(/*Is64Bit=*/false,
- MachO::CPU_TYPE_I386,
- MachO::CPU_SUBTYPE_I386_ALL);
- }
-
- /// Generate the compact unwind encoding for the CFI instructions.
- uint32_t generateCompactUnwindEncoding(
- ArrayRef<MCCFIInstruction> Instrs) const override {
- return generateCompactUnwindEncodingImpl(Instrs);
- }
-};
-
-class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
- const MachO::CPUSubTypeX86 Subtype;
-public:
- DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64,
- Subtype);
- }
-
- /// Generate the compact unwind encoding for the CFI instructions.
- uint32_t generateCompactUnwindEncoding(
- ArrayRef<MCCFIInstruction> Instrs) const override {
- return generateCompactUnwindEncodingImpl(Instrs);
- }
};
} // end anonymous namespace
@@ -1163,7 +1567,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, STI);
+ return new DarwinX86AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, false, STI);
@@ -1181,13 +1585,8 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
- if (TheTriple.isOSBinFormatMachO()) {
- MachO::CPUSubTypeX86 CS =
- StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
- .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
- .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, STI, CS);
- }
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, true, STI);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index a4f8dd669e1e..79f07d3c7792 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -91,7 +91,7 @@ namespace X86 {
COND_G = 15,
LAST_VALID_COND = COND_G,
- // Artificial condition codes. These are used by AnalyzeBranch
+ // Artificial condition codes. These are used by analyzeBranch
// to indicate a block terminated with two conditional branches that together
// form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
// which can't be represented on x86 with a single condition. These
@@ -356,6 +356,39 @@ namespace X86 {
AlignBranchRet = 1U << 4,
AlignBranchIndirect = 1U << 5
};
+
+ /// Defines the encoding values for segment override prefix.
+ enum EncodingOfSegmentOverridePrefix : uint8_t {
+ CS_Encoding = 0x2E,
+ DS_Encoding = 0x3E,
+ ES_Encoding = 0x26,
+ FS_Encoding = 0x64,
+ GS_Encoding = 0x65,
+ SS_Encoding = 0x36
+ };
+
+ /// Given a segment register, return the encoding of the segment override
+ /// prefix for it.
+ inline EncodingOfSegmentOverridePrefix
+ getSegmentOverridePrefixForReg(unsigned Reg) {
+ switch (Reg) {
+ default:
+ llvm_unreachable("Unknown segment register!");
+ case X86::CS:
+ return CS_Encoding;
+ case X86::DS:
+ return DS_Encoding;
+ case X86::ES:
+ return ES_Encoding;
+ case X86::FS:
+ return FS_Encoding;
+ case X86::GS:
+ return GS_Encoding;
+ case X86::SS:
+ return SS_Encoding;
+ }
+ }
+
} // end namespace X86;
/// X86II - This namespace holds all of the target specific flags that
@@ -581,90 +614,107 @@ namespace X86II {
/// in the lower 4 bits of the opcode.
AddCCFrm = 9,
+ /// PrefixByte - This form is used for instructions that represent a prefix
+ /// byte like data16 or rep.
+ PrefixByte = 10,
+
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
///
+    // Instructions that operate on a register Reg/Opcode operand, not the r/m field.
+ MRMr0 = 21,
+
+    /// MRMSrcMemFSIB - Like MRMSrcMem, but forced to use the SIB field.
+ MRMSrcMemFSIB = 22,
+
+    /// MRMDestMemFSIB - Like MRMDestMem, but forced to use the SIB field.
+ MRMDestMemFSIB = 23,
+
/// MRMDestMem - This form is used for instructions that use the Mod/RM byte
/// to specify a destination, which in this case is memory.
///
- MRMDestMem = 32,
+ MRMDestMem = 24,
/// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
/// to specify a source, which in this case is memory.
///
- MRMSrcMem = 33,
+ MRMSrcMem = 25,
/// MRMSrcMem4VOp3 - This form is used for instructions that encode
/// operand 3 with VEX.VVVV and load from memory.
///
- MRMSrcMem4VOp3 = 34,
+ MRMSrcMem4VOp3 = 26,
/// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
/// byte to specify the fourth source, which in this case is memory.
///
- MRMSrcMemOp4 = 35,
+ MRMSrcMemOp4 = 27,
/// MRMSrcMemCC - This form is used for instructions that use the Mod/RM
/// byte to specify the operands and also encodes a condition code.
///
- MRMSrcMemCC = 36,
+ MRMSrcMemCC = 28,
/// MRMXm - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field. And has
/// a condition code.
///
- MRMXmCC = 38,
+ MRMXmCC = 30,
/// MRMXm - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field.
///
- MRMXm = 39,
+ MRMXm = 31,
// Next, instructions that operate on a memory r/m operand...
- MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
- MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7
+ MRM0m = 32, MRM1m = 33, MRM2m = 34, MRM3m = 35, // Format /0 /1 /2 /3
+ MRM4m = 36, MRM5m = 37, MRM6m = 38, MRM7m = 39, // Format /4 /5 /6 /7
/// MRMDestReg - This form is used for instructions that use the Mod/RM byte
/// to specify a destination, which in this case is a register.
///
- MRMDestReg = 48,
+ MRMDestReg = 40,
/// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
/// to specify a source, which in this case is a register.
///
- MRMSrcReg = 49,
+ MRMSrcReg = 41,
/// MRMSrcReg4VOp3 - This form is used for instructions that encode
/// operand 3 with VEX.VVVV and do not load from memory.
///
- MRMSrcReg4VOp3 = 50,
+ MRMSrcReg4VOp3 = 42,
/// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
/// byte to specify the fourth source, which in this case is a register.
///
- MRMSrcRegOp4 = 51,
+ MRMSrcRegOp4 = 43,
/// MRMSrcRegCC - This form is used for instructions that use the Mod/RM
/// byte to specify the operands and also encodes a condition code
///
- MRMSrcRegCC = 52,
+ MRMSrcRegCC = 44,
/// MRMXCCr - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field. And has
/// a condition code.
///
- MRMXrCC = 54,
+ MRMXrCC = 46,
/// MRMXr - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field.
///
- MRMXr = 55,
+ MRMXr = 47,
// Instructions that operate on a register r/m operand...
- MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
- MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7
+ MRM0r = 48, MRM1r = 49, MRM2r = 50, MRM3r = 51, // Format /0 /1 /2 /3
+ MRM4r = 52, MRM5r = 53, MRM6r = 54, MRM7r = 55, // Format /4 /5 /6 /7
+
+    // Instructions that have mod=11 and an opcode, but ignore the r/m field.
+ MRM0X = 56, MRM1X = 57, MRM2X = 58, MRM3X = 59, // Format /0 /1 /2 /3
+ MRM4X = 60, MRM5X = 61, MRM6X = 62, MRM7X = 63, // Format /4 /5 /6 /7
/// MRM_XX - A mod/rm byte of exactly 0xXX.
MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67,
@@ -900,6 +950,16 @@ namespace X86II {
NOTRACK = 1ULL << NoTrackShift
};
+  /// \returns true if the instruction with the given opcode is a prefix.
+ inline bool isPrefix(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == PrefixByte;
+ }
+
+  /// \returns true if the instruction with the given opcode is a pseudo.
+ inline bool isPseudo(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == Pseudo;
+ }
+
/// \returns the "base" X86 opcode for the specified machine
/// instruction.
inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
@@ -1028,10 +1088,13 @@ namespace X86II {
case X86II::RawFrmDst:
case X86II::RawFrmDstSrc:
case X86II::AddCCFrm:
+ case X86II::PrefixByte:
return -1;
case X86II::MRMDestMem:
+ case X86II::MRMDestMemFSIB:
return 0;
case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemFSIB:
// Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
// mask register.
return 1 + HasVEX_4V + HasEVEX_K;
@@ -1051,12 +1114,18 @@ namespace X86II {
case X86II::MRMSrcRegOp4:
case X86II::MRMSrcRegCC:
case X86II::MRMXrCC:
+ case X86II::MRMr0:
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
return -1;
+ case X86II::MRM0X: case X86II::MRM1X:
+ case X86II::MRM2X: case X86II::MRM3X:
+ case X86II::MRM4X: case X86II::MRM5X:
+ case X86II::MRM6X: case X86II::MRM7X:
+ return -1;
case X86II::MRMXmCC:
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index bd009da60851..292dd17e2f51 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -317,8 +317,10 @@ static unsigned getRelocType32(MCContext &Ctx,
unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
if (getEMachine() == ELF::EM_X86_64)
return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 73b1969b4e82..b51011e2c52f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -15,7 +15,7 @@
#include "X86ATTInstPrinter.h"
#include "X86BaseInfo.h"
#include "X86MCTargetDesc.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/raw_ostream.h"
@@ -199,6 +199,40 @@ using namespace llvm;
CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+#define CASE_FMA4(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4Y, suf)
+
+#define CASE_FMA4_PACKED_RR(Inst) \
+ CASE_FMA4(Inst##PD, rr) \
+ CASE_FMA4(Inst##PS, rr)
+
+#define CASE_FMA4_PACKED_RM(Inst) \
+ CASE_FMA4(Inst##PD, rm) \
+ CASE_FMA4(Inst##PS, rm)
+
+#define CASE_FMA4_PACKED_MR(Inst) \
+ CASE_FMA4(Inst##PD, mr) \
+ CASE_FMA4(Inst##PS, mr)
+
+#define CASE_FMA4_SCALAR_RR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr_Int)
+
+#define CASE_FMA4_SCALAR_RM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm_Int)
+
+#define CASE_FMA4_SCALAR_MR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr_Int)
+
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@@ -247,14 +281,15 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
OS << " {z}";
}
-static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
+static bool printFMAComments(const MCInst *MI, raw_ostream &OS,
+ const MCInstrInfo &MCII) {
const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
bool Negate = false;
StringRef AccStr = "+";
- // The operands for FMA instructions without rounding fall into two forms.
+ // The operands for FMA3 instructions without rounding fall into two forms:
// dest, src1, src2, src3
// dest, src1, mask, src2, src3
// Where src3 is either a register or 5 memory address operands. So to find
@@ -262,9 +297,112 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
// index from the end by taking into account memory vs register form when
// finding src2.
+ // The operands for FMA4 instructions:
+ // dest, src1, src2, src3
+  // Where either src2 or src3 is a register or 5 memory address operands. So
+ // to find dest and src1 we can index from the front, src2 (reg/mem) follows
+ // and then src3 (reg) will be at the end.
+
switch (MI->getOpcode()) {
default:
return false;
+
+ CASE_FMA4_PACKED_RR(FMADD)
+ CASE_FMA4_SCALAR_RR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADD)
+ CASE_FMA4_SCALAR_RM(FMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+ CASE_FMA4_PACKED_MR(FMADD)
+ CASE_FMA4_SCALAR_MR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUB)
+ CASE_FMA4_SCALAR_RR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUB)
+ CASE_FMA4_SCALAR_RM(FMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUB)
+ CASE_FMA4_SCALAR_MR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMADD)
+ CASE_FMA4_SCALAR_RR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMADD)
+ CASE_FMA4_SCALAR_RM(FNMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMADD)
+ CASE_FMA4_SCALAR_MR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMSUB)
+ CASE_FMA4_SCALAR_RR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMSUB)
+ CASE_FMA4_SCALAR_RM(FNMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMSUB)
+ CASE_FMA4_SCALAR_MR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADDSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+ CASE_FMA4_PACKED_MR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUBADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
CASE_FMA_PACKED_REG(FMADD132)
CASE_FMA_SCALAR_REG(FMADD132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
@@ -476,8 +614,9 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
if (!Mul2Name) Mul2Name = "mem";
if (!AccName) AccName = "mem";
- OS << DestName << " = ";
- // TODO: Print masking information?
+ OS << DestName;
+ printMasking(OS, MI, MCII);
+ OS << " = ";
if (Negate)
OS << '-';
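
For orientation, the comment assembled by the writes above comes out roughly as follows (register names are illustrative; the masking decoration only appears for EVEX-encoded FMA3 forms, via printMasking):

//   xmm0 = (xmm1 * xmm2) + xmm3             // FMADD, register form
//   xmm0 = -(xmm1 * xmm2) - mem             // FNMSUB, memory form
//   zmm0 {%k1} {z} = (zmm1 * zmm2) + zmm3   // FMA3 with a zeroing mask
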
@@ -504,7 +643,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
- if (printFMA3Comments(MI, OS))
+ if (printFMAComments(MI, OS, MCII))
return true;
switch (MI->getOpcode()) {
@@ -669,14 +808,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSLLDQri:
case X86::VPSLLDQri:
case X86::VPSLLDQYri:
- case X86::VPSLLDQZ128rr:
- case X86::VPSLLDQZ256rr:
- case X86::VPSLLDQZrr:
+ case X86::VPSLLDQZ128ri:
+ case X86::VPSLLDQZ256ri:
+ case X86::VPSLLDQZri:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
- case X86::VPSLLDQZ128rm:
- case X86::VPSLLDQZ256rm:
- case X86::VPSLLDQZrm:
+ case X86::VPSLLDQZ128mi:
+ case X86::VPSLLDQZ256mi:
+ case X86::VPSLLDQZmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0),
@@ -687,14 +826,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSRLDQri:
case X86::VPSRLDQri:
case X86::VPSRLDQYri:
- case X86::VPSRLDQZ128rr:
- case X86::VPSRLDQZ256rr:
- case X86::VPSRLDQZrr:
+ case X86::VPSRLDQZ128ri:
+ case X86::VPSRLDQZ256ri:
+ case X86::VPSRLDQZri:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
- case X86::VPSRLDQZ128rm:
- case X86::VPSRLDQZ256rm:
- case X86::VPSRLDQZrm:
+ case X86::VPSRLDQZ128mi:
+ case X86::VPSRLDQZ256mi:
+ case X86::VPSRLDQZmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0),
@@ -1178,28 +1317,28 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeSubVectorBroadcast(16, 8, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rm)
DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rm)
DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rm)
DecodeSubVectorBroadcast(16, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
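
As a reminder of what these helpers compute (based on the X86ShuffleDecode implementation; treat the exact values as illustrative): DecodeSubVectorBroadcast(DstElts, SrcElts, Mask) repeats the source lane indices across the destination, so for the Z128 case above:

// DecodeSubVectorBroadcast(4, 2, ShuffleMask);
//   -> ShuffleMask == {0, 1, 0, 1}
// and for the 512-bit case, {0, 1} repeated eight times.
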
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index a21555076976..33d70fdb1214 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -13,6 +13,7 @@
#include "X86InstPrinterCommon.h"
#include "X86BaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -287,16 +288,23 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
}
}
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). In
-/// Intel-style these print slightly differently than normal immediates.
-/// for example, a $ is not emitted.
-void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+/// This is used to print an immediate value that ends up being encoded as a
+/// pc-relative value (e.g. for jumps and calls). In Intel-style these print
+/// slightly differently than normal immediates. For example, a $ is not
+/// emitted.
+///
+/// \p Address The address of the next instruction.
+/// \see MCInstPrinter::printInst
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
+ unsigned OpNo, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
+ if (Op.isImm()) {
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + Op.getImm();
+ if (MAI.getCodePointerSize() == 4)
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else
+ O << formatImm(Op.getImm());
+ } else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
// If a symbolic branch target was added as a constant expression then print
// that address in hex.
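
A minimal sketch of the target computation added above, with made-up numbers (Address is the address of the next instruction, per the doc comment):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Address = 0x401005; // instruction after a 5-byte branch at 0x401000
  int64_t Imm = 0x10;          // pc-relative immediate from the MCInst
  uint64_t Target = Address + Imm;
  assert(Target == 0x401015);
  // For 32-bit code (code pointer size of 4) the value is further masked:
  assert((Target & 0xffffffff) == 0x401015);
  return 0;
}
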
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 8e28f24b619a..bb12ede3b729 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -29,7 +29,9 @@ public:
void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ raw_ostream &O);
+
protected:
void printInstFlags(const MCInst *MI, raw_ostream &O);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index f4bb0fbf62cd..d1eb4d09851d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -45,8 +45,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (MI->getOpcode() == X86::DATA16_PREFIX &&
STI.getFeatureBits()[X86::Mode16Bit]) {
OS << "\tdata32";
- } else if (!printAliasInstr(MI, OS) &&
- !printVecCompareInstr(MI, OS))
+ } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
printInstruction(MI, Address, OS);
// Next always print the annotation.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index b409b20cbea8..82baf611df03 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -31,9 +31,10 @@ public:
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
@@ -47,14 +48,6 @@ public:
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
- void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "byte ptr ";
printMemReference(MI, OpNo, O);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index d986c829d98e..c294da6baffa 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -71,8 +71,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// (actually, must, since otherwise the non-extern relocations we produce
// overwhelm ld64's tiny little mind and it fails).
DwarfFDESymbolsUseAbsDiff = true;
-
- UseIntegratedAssembler = true;
}
X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
@@ -102,10 +100,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
-
- // Always enable the integrated assembler by default.
- // Clang also enabled it when the OS is Solaris but that is redundant here.
- UseIntegratedAssembler = true;
}
const MCExpr *
@@ -141,8 +135,16 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
TextAlignFillValue = 0x90;
AllowAtInName = true;
+}
- UseIntegratedAssembler = true;
+void X86MCAsmInfoMicrosoftMASM::anchor() { }
+
+X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple)
+ : X86MCAsmInfoMicrosoft(Triple) {
+ DollarIsPC = true;
+ SeparatorString = "\n";
+ CommentString = ";";
+ AllowSymbolAtNameStart = true;
}
void X86MCAsmInfoGNUCOFF::anchor() { }
@@ -164,6 +166,4 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
TextAlignFillValue = 0x90;
AllowAtInName = true;
-
- UseIntegratedAssembler = true;
}
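
For context, the new MASM variant accepts input along these lines (illustrative snippet, not part of the patch); because ';' is the comment string, statements are separated only by newlines, hence SeparatorString = "\n":

//   mov eax, 1      ; a ';' starts a comment (CommentString = ";")
// spin:
//   jmp $           ; '$' names the current location (DollarIsPC)
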
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index b2369647a40f..ce8e84fb96b9 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -13,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
@@ -49,6 +48,13 @@ public:
explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
};
+class X86MCAsmInfoMicrosoftMASM : public X86MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoftMASM(const Triple &Triple);
+};
+
class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
void anchor() override;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 54a293702bd0..7dea0760a831 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -55,83 +55,64 @@ public:
const MCSubtargetInfo &STI) const override;
private:
- unsigned getX86RegNum(const MCOperand &MO) const {
- return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
- }
+ unsigned getX86RegNum(const MCOperand &MO) const;
- unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
- return Ctx.getRegisterInfo()->getEncodingValue(
- MI.getOperand(OpNum).getReg());
- }
+ unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const;
/// \param MI a single low-level machine instruction.
/// \param OpNum the operand #.
/// \returns true if the OpNumth operand of MI require a bit to be set in
/// REX prefix.
- bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
- return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
- }
-
- void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
- OS << (char)C;
- ++CurByte;
- }
-
- void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
- raw_ostream &OS) const {
- // Output the constant in little endian byte order.
- for (unsigned i = 0; i != Size; ++i) {
- emitByte(Val & 255, CurByte, OS);
- Val >>= 8;
- }
- }
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const;
void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
- MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ MCFixupKind FixupKind, uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
- static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
- assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
- return RM | (RegOpcode << 3) | (Mod << 6);
- }
-
void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
- unsigned &CurByte, raw_ostream &OS) const {
- emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS);
- }
+ raw_ostream &OS) const;
void emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
- unsigned &CurByte, raw_ostream &OS) const {
- // SIB byte is in the same format as the modRMByte.
- emitByte(modRMByte(SS, Index, Base), CurByte, OS);
- }
+ raw_ostream &OS) const;
void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
- uint64_t TSFlags, bool Rex, unsigned &CurByte,
+ uint64_t TSFlags, bool HasREX, uint64_t StartByte,
raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI,
+ bool ForceSIB = false) const;
- void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte,
- bool &Rex, const MCInst &MI, const MCInstrDesc &Desc,
- const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ bool emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
- void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
- const MCInst &MI, const MCInstrDesc &Desc,
+ void emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
raw_ostream &OS) const;
- void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
- const MCInst &MI, raw_ostream &OS) const;
+ void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI,
+ raw_ostream &OS) const;
- bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
- const MCInst &MI, const MCInstrDesc &Desc,
+ bool emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI, raw_ostream &OS) const;
- uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand,
- const MCInstrDesc &Desc) const;
+ bool emitREXPrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const;
};
} // end anonymous namespace
+static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+static void emitByte(uint8_t C, raw_ostream &OS) { OS << static_cast<char>(C); }
+
+static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ emitByte(Val & 255, OS);
+ Val >>= 8;
+ }
+}
+
/// \returns true if this signed displacement fits in a 8-bit sign-extended
/// field.
static bool isDisp8(int Value) { return Value == (int8_t)Value; }
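
A self-contained check of the helpers above; it mirrors their definitions verbatim and only exists to make the bit layout concrete:

#include <cassert>
#include <cstdint>

static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  return RM | (RegOpcode << 3) | (Mod << 6);
}
static bool isDisp8(int Value) { return Value == (int8_t)Value; }

int main() {
  // mod=3 (register direct), reg=2, r/m=1 -> 0b11'010'001 == 0xD1.
  assert(modRMByte(3, 2, 1) == 0xD1);
  // A SIB byte shares the layout: scale=x4 (0b10), index=1, base=5 -> 0x8D.
  assert(modRMByte(2, 1, 5) == 0x8D);
  // isDisp8 accepts exactly the sign-extendable 8-bit range.
  assert(isDisp8(-128) && isDisp8(127) && !isDisp8(128));
  // emitConstant(0x12345678, 4, OS) would write 0x78 0x56 0x34 0x12.
  return 0;
}
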
@@ -275,7 +256,8 @@ static bool hasSecRelSymbolRef(const MCExpr *Expr) {
static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
- if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) ||
+ if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4 &&
+ Opcode != X86::JCC_4) ||
getImmFixupKind(Desc.TSFlags) != FK_PCRel_4)
return false;
@@ -288,9 +270,27 @@ static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
}
+unsigned X86MCCodeEmitter::getX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+}
+
+unsigned X86MCCodeEmitter::getX86RegEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(OpNum).getReg());
+}
+
+/// \param MI a single low-level machine instruction.
+/// \param OpNum the operand #.
+/// \returns true if the OpNumth operand of MI requires a bit to be set in
+/// REX prefix.
+bool X86MCCodeEmitter::isREXExtendedReg(const MCInst &MI,
+ unsigned OpNum) const {
+ return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
+}
+
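
Illustration of the two accessors just defined, using the standard x86-64 register encodings (RCX encodes as 1, R9 as 9):

// getX86RegNum keeps the low three bits; isREXExtendedReg tests bit 3.
static_assert((9 & 0x7) == 1 && ((9 >> 3) & 1) == 1, "R9 needs a REX bit");
static_assert((1 & 0x7) == 1 && ((1 >> 3) & 1) == 0, "RCX does not");
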
void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
unsigned Size, MCFixupKind FixupKind,
- unsigned &CurByte, raw_ostream &OS,
+ uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
int ImmOffset) const {
const MCExpr *Expr = nullptr;
@@ -299,7 +299,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
// relocation, emit it now.
if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 &&
FixupKind != FK_PCRel_4) {
- emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS);
+ emitConstant(DispOp.getImm() + ImmOffset, Size, OS);
return;
}
Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
@@ -322,7 +322,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
}
if (Kind == GOT_Normal)
- ImmOffset = CurByte;
+ ImmOffset = static_cast<int>(OS.tell() - StartByte);
} else if (Expr->getKind() == MCExpr::SymbolRef) {
if (hasSecRelSymbolRef(Expr)) {
FixupKind = MCFixupKind(FK_SecRel_4);
@@ -361,16 +361,30 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
Ctx);
// Emit a symbolic constant as a fixup and 4 zeros.
- Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
- emitConstant(0, Size, CurByte, OS);
+ Fixups.push_back(MCFixup::create(static_cast<uint32_t>(OS.tell() - StartByte),
+ Expr, FixupKind, Loc));
+ emitConstant(0, Size, OS);
+}
+
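
In short, the CurByte counter threaded through the old signatures is gone; the same offset is now recovered from the output stream:

// Before: every emitByte bumped CurByte, so it held the number of bytes
// already written for this instruction when a fixup was recorded.
// After:  that offset is recomputed on demand as OS.tell() - StartByte,
// where StartByte is sampled in encodeInstruction() before the first prefix.
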
+void X86MCCodeEmitter::emitRegModRMByte(const MCOperand &ModRMReg,
+ unsigned RegOpcodeFld,
+ raw_ostream &OS) const {
+ emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), OS);
+}
+
+void X86MCCodeEmitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ raw_ostream &OS) const {
+ // SIB byte is in the same format as the modRMByte.
+ emitByte(modRMByte(SS, Index, Base), OS);
}
void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned RegOpcodeField,
- uint64_t TSFlags, bool Rex,
- unsigned &CurByte, raw_ostream &OS,
+ uint64_t TSFlags, bool HasREX,
+ uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ const MCSubtargetInfo &STI,
+ bool ForceSIB) const {
const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
@@ -383,8 +397,9 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
assert(STI.hasFeature(X86::Mode64Bit) &&
"Rip-relative addressing requires 64-bit mode");
- assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
- emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ assert(IndexReg.getReg() == 0 && !ForceSIB &&
+ "Invalid rip-relative address");
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
unsigned Opcode = MI.getOpcode();
// movq loads are handled with a special relocation form which allows the
@@ -395,7 +410,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
default:
return X86::reloc_riprel_4byte;
case X86::MOV64rm:
- assert(Rex);
+ assert(HasREX);
return X86::reloc_riprel_4byte_movq_load;
case X86::CALL64m:
case X86::JMP64m:
@@ -409,8 +424,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
case X86::SBB64rm:
case X86::SUB64rm:
case X86::XOR64rm:
- return Rex ? X86::reloc_riprel_4byte_relax_rex
- : X86::reloc_riprel_4byte_relax;
+ return HasREX ? X86::reloc_riprel_4byte_relax_rex
+ : X86::reloc_riprel_4byte_relax;
}
}();
@@ -425,7 +440,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
? X86II::getSizeOfImm(TSFlags)
: 0;
- emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
Fixups, -ImmSize);
return;
}
@@ -472,23 +487,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (Disp.isImm() && isDisp8(Disp.getImm())) {
if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
- emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, RMfield), OS);
return;
}
// Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
- emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, RMfield), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
return;
}
// This is the [REG]+disp16 case.
- emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, RMfield), OS);
} else {
// There is no BaseReg; this is the plain [disp16] case.
- emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 6), OS);
}
// Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
- emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups);
return;
}
@@ -498,7 +513,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// 2-7) and absolute references.
if ( // The SIB byte must be used if there is an index register.
- IndexReg.getReg() == 0 &&
+ !ForceSIB && IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
// present.
@@ -508,8 +523,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
(!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) {
if (BaseReg == 0) { // [disp32] in X86-32 mode
- emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
+ emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, StartByte, OS, Fixups);
return;
}
@@ -519,7 +534,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// by emitting a displacement of 0 below.
if (BaseRegNo != N86::EBP) {
if (Disp.isImm() && Disp.getImm() == 0) {
- emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
@@ -530,7 +545,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// This is exclusively used by call *a@tlscall(base). The relocation
// (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
- emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
}
@@ -539,27 +554,27 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (Disp.isImm()) {
if (!HasEVEX && isDisp8(Disp.getImm())) {
- emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
return;
}
// Try EVEX compressed 8-bit displacement first; if failed, fall back to
// 32-bit displacement.
int CDisp8 = 0;
if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
- emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
CDisp8 - Disp.getImm());
return;
}
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
- emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS);
unsigned Opcode = MI.getOpcode();
unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
: X86::reloc_signed_4byte;
- emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
Fixups);
return;
}
@@ -575,30 +590,30 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
- emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
ForceDisp32 = true;
} else if (!Disp.isImm()) {
// Emit the normal disp32 encoding.
- emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), OS);
ForceDisp32 = true;
} else if (Disp.getImm() == 0 &&
// Base reg can't be anything that ends up with '5' as the base
// reg, it is the magic [*] nomenclature that indicates no base.
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
- emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
} else if (!HasEVEX && isDisp8(Disp.getImm())) {
// Emit the disp8 encoding.
- emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(1, RegOpcodeField, 4), OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
} else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
// Emit the disp8 encoding.
- emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(1, RegOpcodeField, 4), OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
ImmOffset = CDisp8 - Disp.getImm();
} else {
// Emit the normal disp32 encoding.
- emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), OS);
}
// Calculate what the SS field value should be...
@@ -613,77 +628,78 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
IndexRegNo = getX86RegNum(IndexReg);
else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
IndexRegNo = 4;
- emitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ emitSIBByte(SS, IndexRegNo, 5, OS);
} else {
unsigned IndexRegNo;
if (IndexReg.getReg())
IndexRegNo = getX86RegNum(IndexReg);
else
IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
- emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS);
+ emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS);
}
// Do we need to output a displacement?
if (ForceDisp8)
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
ImmOffset);
else if (ForceDisp32 || Disp.getImm() != 0)
emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
}
-void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
- unsigned &CurByte, bool &Rex,
- const MCInst &MI, const MCInstrDesc &Desc,
- const MCSubtargetInfo &STI,
- raw_ostream &OS) const {
+/// Emit all instruction prefixes.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ uint64_t TSFlags = MCII.get(MI.getOpcode()).TSFlags;
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
- if (MemoryOperand != -1)
- MemoryOperand += CurOp;
-
// Emit segment override opcode prefix as needed.
- if (MemoryOperand >= 0)
- emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI,
- OS);
+ if (MemoryOperand != -1) {
+ MemoryOperand += CurOp;
+ emitSegmentOverridePrefix(MemoryOperand + X86::AddrSegmentReg, MI, OS);
+ }
// Emit the repeat opcode prefix as needed.
unsigned Flags = MI.getFlags();
if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
- emitByte(0xF3, CurByte, OS);
+ emitByte(0xF3, OS);
if (Flags & X86::IP_HAS_REPEAT_NE)
- emitByte(0xF2, CurByte, OS);
+ emitByte(0xF2, OS);
// Emit the address size opcode prefix as needed.
- bool need_address_override;
+ bool NeedAddressOverride;
uint64_t AdSize = TSFlags & X86II::AdSizeMask;
if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) ||
(STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) ||
(STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) {
- need_address_override = true;
+ NeedAddressOverride = true;
} else if (MemoryOperand < 0) {
- need_address_override = false;
+ NeedAddressOverride = false;
} else if (STI.hasFeature(X86::Mode64Bit)) {
assert(!is16BitMemOperand(MI, MemoryOperand, STI));
- need_address_override = is32BitMemOperand(MI, MemoryOperand);
+ NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand);
} else if (STI.hasFeature(X86::Mode32Bit)) {
assert(!is64BitMemOperand(MI, MemoryOperand));
- need_address_override = is16BitMemOperand(MI, MemoryOperand, STI);
+ NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI);
} else {
assert(STI.hasFeature(X86::Mode16Bit));
assert(!is64BitMemOperand(MI, MemoryOperand));
- need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI);
+ NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI);
}
- if (need_address_override)
- emitByte(0x67, CurByte, OS);
+ if (NeedAddressOverride)
+ emitByte(0x67, OS);
// Encoding type for this instruction.
uint64_t Encoding = TSFlags & X86II::EncodingMask;
- if (Encoding == 0)
- Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+ bool HasREX = false;
+ if (Encoding)
+ emitVEXOpcodePrefix(MemoryOperand, MI, OS);
else
- emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+ HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS);
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
@@ -697,11 +713,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
"SI and DI register sizes do not match");
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(2).getReg() != X86::DS)
- emitSegmentOverridePrefix(CurByte, 2, MI, OS);
+ emitSegmentOverridePrefix(2, MI, OS);
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
CurOp += 3; // Consume operands.
break;
}
@@ -709,11 +725,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
unsigned siReg = MI.getOperand(0).getReg();
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(1).getReg() != X86::DS)
- emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ emitSegmentOverridePrefix(1, MI, OS);
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
CurOp += 2; // Consume operands.
break;
}
@@ -722,24 +738,26 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
++CurOp; // Consume operand.
break;
}
case X86II::RawFrmMemOffs: {
// Emit segment override opcode prefix as needed.
- emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ emitSegmentOverridePrefix(1, MI, OS);
break;
}
}
+
+ return HasREX;
}
-/// emitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
-/// called VEX.
-void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
- int MemOperand, const MCInst &MI,
- const MCInstrDesc &Desc,
+/// AVX instructions are encoded using an opcode prefix called VEX.
+void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
raw_ostream &OS) const {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
uint64_t Encoding = TSFlags & X86II::EncodingMask;
@@ -868,8 +886,11 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
switch (TSFlags & X86II::FormMask) {
default:
llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+ case X86II::MRM_C0:
case X86II::RawFrm:
+ case X86II::PrefixByte:
break;
+ case X86II::MRMDestMemFSIB:
case X86II::MRMDestMem: {
// MRMDestMem instructions forms:
// MemAddr, src1(ModR/M)
@@ -900,6 +921,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EVEX_R2 = ~(RegEnc >> 4) & 1;
break;
}
+ case X86II::MRMSrcMemFSIB:
case X86II::MRMSrcMem: {
// MRMSrcMem instructions forms:
// src1(ModR/M), MemAddr
@@ -1081,6 +1103,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EncodeRC = true;
break;
}
+ case X86II::MRMr0: {
+ // MRMr0 instructions forms:
+ // 11:rrr:000
+ // dst(ModR/M)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ break;
+ }
case X86II::MRM0r:
case X86II::MRM1r:
case X86II::MRM2r:
@@ -1127,15 +1158,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Can we use the 2 byte VEX prefix?
if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX &&
VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
- emitByte(0xC5, CurByte, OS);
- emitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ emitByte(0xC5, OS);
+ emitByte(LastByte | (VEX_R << 7), OS);
return;
}
// 3 byte VEX prefix
- emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
- emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- emitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, OS);
+ emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, OS);
+ emitByte(LastByte | (VEX_W << 7), OS);
} else {
assert(Encoding == X86II::EVEX && "unknown encoding!");
// EVEX opcode prefix can have 4 bytes
@@ -1146,144 +1177,137 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
assert((VEX_5M & 0x3) == VEX_5M &&
"More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
- emitByte(0x62, CurByte, OS);
+ emitByte(0x62, OS);
emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) |
VEX_5M,
- CurByte, OS);
- emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte,
OS);
+ emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, OS);
if (EncodeRC)
emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) |
EVEX_aaa,
- CurByte, OS);
+ OS);
else
emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) |
(EVEX_V2 << 3) | EVEX_aaa,
- CurByte, OS);
+ OS);
}
}
-/// Determine if the MCInst has to be encoded with a X86-64 REX prefix which
-/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use
-/// of X86-64 extended registers.
-uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags,
- int MemOperand,
- const MCInstrDesc &Desc) const {
- uint8_t REX = 0;
- bool UsesHighByteReg = false;
-
- if (TSFlags & X86II::REX_W)
- REX |= 1 << 3; // set REX.W
+/// Emit REX prefix which specifies
+/// 1) 64-bit instructions,
+/// 2) non-default operand size, and
+/// 3) use of X86-64 extended registers.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
+ raw_ostream &OS) const {
+ uint8_t REX = [&, MemOperand]() {
+ uint8_t REX = 0;
+ bool UsesHighByteReg = false;
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3; // set REX.W
+
+ if (MI.getNumOperands() == 0)
+ return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ for (unsigned i = CurOp; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto
+ // anything that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ }
- if (MI.getNumOperands() == 0)
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::AddRegFrm:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcRegCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ break;
+ case X86II::MRMDestReg:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMem:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMXmCC:
+ case X86II::MRMXm:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ break;
+ case X86II::MRMXrCC:
+ case X86II::MRMXr:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMr0:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMemFSIB:
+      llvm_unreachable("FSIB format never needs REX prefix!");
+ }
+ if (REX && UsesHighByteReg)
+ report_fatal_error(
+ "Cannot encode high byte register in REX-prefixed instruction");
return REX;
+ }();
- unsigned NumOps = MI.getNumOperands();
- unsigned CurOp = X86II::getOperandBias(Desc);
-
- // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
- for (unsigned i = CurOp; i != NumOps; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
- UsesHighByteReg = true;
- if (X86II::isX86_64NonExtLowByteReg(Reg))
- // FIXME: The caller of determineREXPrefix slaps this prefix onto anything
- // that returns non-zero.
- REX |= 0x40; // REX fixed encoding prefix
- }
-
- switch (TSFlags & X86II::FormMask) {
- case X86II::AddRegFrm:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- case X86II::MRMSrcReg:
- case X86II::MRMSrcRegCC:
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- case X86II::MRMSrcMem:
- case X86II::MRMSrcMemCC:
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- CurOp += X86::AddrNumOperands;
- break;
- case X86II::MRMDestReg:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- break;
- case X86II::MRMDestMem:
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- CurOp += X86::AddrNumOperands;
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- break;
- case X86II::MRMXmCC:
- case X86II::MRMXm:
- case X86II::MRM0m:
- case X86II::MRM1m:
- case X86II::MRM2m:
- case X86II::MRM3m:
- case X86II::MRM4m:
- case X86II::MRM5m:
- case X86II::MRM6m:
- case X86II::MRM7m:
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- break;
- case X86II::MRMXrCC:
- case X86II::MRMXr:
- case X86II::MRM0r:
- case X86II::MRM1r:
- case X86II::MRM2r:
- case X86II::MRM3r:
- case X86II::MRM4r:
- case X86II::MRM5r:
- case X86II::MRM6r:
- case X86II::MRM7r:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- }
- if (REX && UsesHighByteReg)
- report_fatal_error(
- "Cannot encode high byte register in REX-prefixed instruction");
+ if (!REX)
+ return false;
- return REX;
+ emitByte(0x40 | REX, OS);
+ return true;
}
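
A compile-time illustration of the byte emitted on the last line above (the REX prefix is 0b0100WRXB; the helper name is hypothetical):

#include <cstdint>
constexpr uint8_t rexByte(bool W, bool R, bool X, bool B) {
  return 0x40 | (W << 3) | (R << 2) | (X << 1) | (B << 0);
}
static_assert(rexByte(1, 0, 0, 0) == 0x48, "REX.W alone");
static_assert(rexByte(1, 0, 0, 1) == 0x49, "REX.W + REX.B (extended r/m reg)");
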
/// Emit segment override opcode prefix as needed.
-void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte,
- unsigned SegOperand,
+void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned SegOperand,
const MCInst &MI,
raw_ostream &OS) const {
// Check for explicit segment override on memory operand.
- switch (MI.getOperand(SegOperand).getReg()) {
- default:
- llvm_unreachable("Unknown segment register!");
- case 0:
- break;
- case X86::CS:
- emitByte(0x2E, CurByte, OS);
- break;
- case X86::SS:
- emitByte(0x36, CurByte, OS);
- break;
- case X86::DS:
- emitByte(0x3E, CurByte, OS);
- break;
- case X86::ES:
- emitByte(0x26, CurByte, OS);
- break;
- case X86::FS:
- emitByte(0x64, CurByte, OS);
- break;
- case X86::GS:
- emitByte(0x65, CurByte, OS);
- break;
- }
+ if (unsigned Reg = MI.getOperand(SegOperand).getReg())
+ emitByte(X86::getSegmentOverridePrefixForReg(Reg), OS);
}
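
The switch removed above is presumably what the new X86::getSegmentOverridePrefixForReg helper encapsulates; a sketch of that mapping, assuming the surrounding LLVM headers (the real helper appears to live in X86BaseInfo.h and may be written differently):

inline uint8_t getSegmentOverridePrefixForRegSketch(unsigned Reg) {
  switch (Reg) {
  default: llvm_unreachable("Unknown segment register!");
  case X86::CS: return 0x2E;
  case X86::SS: return 0x36;
  case X86::DS: return 0x3E;
  case X86::ES: return 0x26;
  case X86::FS: return 0x64;
  case X86::GS: return 0x65;
  }
}
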
/// Emit all instruction prefixes prior to the opcode.
@@ -1291,48 +1315,44 @@ void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte,
/// \param MemOperand the operand # of the start of a memory operand if present.
/// If not present, it is -1.
///
-/// \returns true if a REX prefix was used.
-bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
- int MemOperand, const MCInst &MI,
- const MCInstrDesc &Desc,
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI,
raw_ostream &OS) const {
- bool Ret = false;
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
// Emit the operand size opcode prefix as needed.
if ((TSFlags & X86II::OpSizeMask) ==
(STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16))
- emitByte(0x66, CurByte, OS);
+ emitByte(0x66, OS);
// Emit the LOCK opcode prefix.
if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
- emitByte(0xF0, CurByte, OS);
+ emitByte(0xF0, OS);
// Emit the NOTRACK opcode prefix.
if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
- emitByte(0x3E, CurByte, OS);
+ emitByte(0x3E, OS);
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD: // 66
- emitByte(0x66, CurByte, OS);
+ emitByte(0x66, OS);
break;
case X86II::XS: // F3
- emitByte(0xF3, CurByte, OS);
+ emitByte(0xF3, OS);
break;
case X86II::XD: // F2
- emitByte(0xF2, CurByte, OS);
+ emitByte(0xF2, OS);
break;
}
// Handle REX prefix.
- // FIXME: Can this come before F2 etc to simplify emission?
- if (STI.hasFeature(X86::Mode64Bit)) {
- if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
- emitByte(0x40 | REX, CurByte, OS);
- Ret = true;
- }
- } else {
- assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode.");
- }
+ assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
+ "REX.W requires 64bit mode.");
+ bool HasREX = STI.hasFeature(X86::Mode64Bit)
+ ? emitREXPrefix(MemOperand, MI, OS)
+ : false;
// 0x0F escape code must be emitted just before the opcode.
switch (TSFlags & X86II::OpMapMask) {
@@ -1340,19 +1360,20 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::T8: // 0F 38
case X86II::TA: // 0F 3A
case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
- emitByte(0x0F, CurByte, OS);
+ emitByte(0x0F, OS);
break;
}
switch (TSFlags & X86II::OpMapMask) {
case X86II::T8: // 0F 38
- emitByte(0x38, CurByte, OS);
+ emitByte(0x38, OS);
break;
case X86II::TA: // 0F 3A
- emitByte(0x3A, CurByte, OS);
+ emitByte(0x3A, OS);
break;
}
- return Ret;
+
+ return HasREX;
}
void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
@@ -1362,16 +1383,12 @@ void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
// Pseudo instructions don't get encoded.
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ if (X86II::isPseudo(TSFlags))
return;
unsigned CurOp = X86II::getOperandBias(Desc);
- // Keep track of the current byte being emitted.
- unsigned CurByte = 0;
-
- bool Rex = false;
- emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
+ emitPrefixImpl(CurOp, MI, STI, OS);
}
void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -1382,17 +1399,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
// Pseudo instructions don't get encoded.
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ if (X86II::isPseudo(TSFlags))
return;
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = X86II::getOperandBias(Desc);
- // Keep track of the current byte being emitted.
- unsigned CurByte = 0;
+ uint64_t StartByte = OS.tell();
- bool Rex = false;
- emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
+ bool HasREX = emitPrefixImpl(CurOp, MI, STI, OS);
// It uses the VEX.VVVV field?
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
@@ -1422,7 +1437,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::RawFrmDstSrc:
case X86II::RawFrmSrc:
case X86II::RawFrmDst:
- emitByte(BaseOpcode, CurByte, OS);
+ case X86II::PrefixByte:
+ emitByte(BaseOpcode, OS);
break;
case X86II::AddCCFrm: {
// This will be added to the opcode in the fallthrough.
@@ -1431,47 +1447,47 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
--NumOps; // Drop the operand from the end.
LLVM_FALLTHROUGH;
case X86II::RawFrm:
- emitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
+ emitByte(BaseOpcode + OpcodeOffset, OS);
if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII))
break;
const MCOperand &Op = MI.getOperand(CurOp++);
emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
- MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS,
+ MCFixupKind(X86::reloc_branch_4byte_pcrel), StartByte, OS,
Fixups);
break;
}
case X86II::RawFrmMemOffs:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
++CurOp; // skip segment operand
break;
case X86II::RawFrmImm8:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
- emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, StartByte,
OS, Fixups);
break;
case X86II::RawFrmImm16:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
- emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, StartByte,
OS, Fixups);
break;
case X86II::AddRegFrm:
- emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), OS);
break;
case X86II::MRMDestReg: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1481,12 +1497,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++SrcRegNum;
emitRegModRMByte(MI.getOperand(CurOp),
- getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+ getX86RegNum(MI.getOperand(SrcRegNum)), OS);
CurOp = SrcRegNum + 1;
break;
}
+ case X86II::MRMDestMemFSIB:
case X86II::MRMDestMem: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasEVEX_K) // Skip writemask
@@ -1495,13 +1512,14 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
+ bool ForceSIB = (Form == X86II::MRMDestMemFSIB);
emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
- Rex, CurByte, OS, Fixups, STI);
+ HasREX, StartByte, OS, Fixups, STI, ForceSIB);
CurOp = SrcRegNum + 1;
break;
}
case X86II::MRMSrcReg: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1511,7 +1529,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++SrcRegNum;
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
@@ -1521,17 +1539,17 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
case X86II::MRMSrcReg4VOp3: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
++CurOp; // Encoded in VEX.VVVV
break;
}
case X86II::MRMSrcRegOp4: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
// Skip 1st src (which is encoded in VEX_VVVV)
@@ -1542,7 +1560,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
break;
}
@@ -1551,12 +1569,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned SecondOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
emitRegModRMByte(MI.getOperand(SecondOp),
- getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(FirstOp)), OS);
break;
}
+ case X86II::MRMSrcMemFSIB:
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp + 1;
@@ -1566,10 +1585,11 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V)
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
+ bool ForceSIB = (Form == X86II::MRMSrcMemFSIB);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI, ForceSIB);
CurOp = FirstMemOp + X86::AddrNumOperands;
if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
@@ -1578,10 +1598,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRMSrcMem4VOp3: {
unsigned FirstMemOp = CurOp + 1;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
++CurOp; // Encoded in VEX.VVVV.
break;
@@ -1595,10 +1615,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
break;
}
@@ -1608,10 +1628,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
break;
}
@@ -1619,8 +1639,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned RegOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
- emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
+ emitRegModRMByte(MI.getOperand(RegOp), 0, OS);
break;
}
@@ -1637,10 +1657,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitRegModRMByte(MI.getOperand(CurOp++),
- (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte,
- OS);
+ (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, OS);
+ break;
+ case X86II::MRMr0:
+ emitByte(BaseOpcode, OS);
+    emitByte(modRMByte(3, getX86RegNum(MI.getOperand(CurOp++)), 0), OS);
break;
case X86II::MRMXmCC: {
@@ -1648,9 +1671,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
- emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI);
+ emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, HasREX, StartByte, OS, Fixups,
+ STI);
break;
}
@@ -1667,13 +1691,25 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, CurOp,
(Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
- Rex, CurByte, OS, Fixups, STI);
+ HasREX, StartByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
+ case X86II::MRM0X:
+ case X86II::MRM1X:
+ case X86II::MRM2X:
+ case X86II::MRM3X:
+ case X86II::MRM4X:
+ case X86II::MRM5X:
+ case X86II::MRM6X:
+ case X86II::MRM7X:
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + ((Form - X86II::MRM0X) << 3), OS);
+ break;
+
case X86II::MRM_C0:
case X86II::MRM_C1:
case X86II::MRM_C2:
@@ -1738,8 +1774,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_FD:
case X86II::MRM_FE:
case X86II::MRM_FF:
- emitByte(BaseOpcode, CurByte, OS);
- emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + Form - X86II::MRM_C0, OS);
break;
}
@@ -1754,7 +1790,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
I8RegNum |= Val;
}
emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
} else {
// If there is a remaining operand, it must be a trailing immediate. Emit it
// according to the right size for the instruction. Some instructions
@@ -1762,13 +1798,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
while (CurOp != NumOps && NumOps - CurOp <= 2) {
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
}
}
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
- emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+ emitByte(X86II::getBaseOpcodeFor(TSFlags), OS);
+ assert(OS.tell() - StartByte <= 15 &&
+ "The size of instruction must be no longer than 15.");
#ifndef NDEBUG
// FIXME: Verify.
if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 049a3a815984..81110ba666e9 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -30,10 +30,6 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
-#if _MSC_VER
-#include <intrin.h>
-#endif
-
using namespace llvm;
#define GET_REGINFO_MC_DESC
@@ -294,7 +290,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (!FS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
@@ -335,7 +331,10 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
MAI = new X86ELFMCAsmInfo(TheTriple);
} else if (TheTriple.isWindowsMSVCEnvironment() ||
TheTriple.isWindowsCoreCLREnvironment()) {
- MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ if (Options.getAssemblyLanguage().equals_lower("masm"))
+ MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple);
+ else
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
} else if (TheTriple.isOSCygMing() ||
TheTriple.isWindowsItaniumEnvironment()) {
MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
@@ -350,7 +349,7 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
// Initial state of the frame pointer is esp+stackGrowth.
unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
MAI->addInitialFrameState(Inst);
@@ -401,6 +400,9 @@ public:
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
uint64_t GotSectionVA,
const Triple &TargetTriple) const override;
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override;
Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
uint64_t Addr,
uint64_t Size) const override;
@@ -519,6 +521,15 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries(
}
}
+bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const {
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+ return false;
+ Target = Addr + Size + Inst.getOperand(0).getImm();
+ return true;
+}
+
Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 0c789061f0e1..e8c72be1d9b6 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -13,27 +13,28 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
-#include "llvm/MC/MCRegister.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/DataTypes.h"
+#include <memory>
#include <string>
namespace llvm {
+class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
+class MCInst;
+class MCInstPrinter;
class MCInstrInfo;
class MCObjectTargetWriter;
class MCObjectWriter;
+class MCRegister;
class MCRegisterInfo;
+class MCStreamer;
class MCSubtargetInfo;
-class MCRelocationInfo;
class MCTargetOptions;
+class MCTargetStreamer;
class Target;
class Triple;
class StringRef;
-class raw_ostream;
-class raw_pwrite_stream;
/// Flavour of dwarf regnumbers
///
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
index 48fd3e0b7ab9..62c1c399a606 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "X86ShuffleDecode.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
@@ -141,9 +143,6 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
ShuffleMask.push_back(i + Imm);
}
-/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
unsigned Size = NumElts * ScalarBits;
@@ -197,9 +196,6 @@ void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(h);
}
-/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
-/// the type of the vector allowing it to handle different datatypes and vector
-/// widths.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumLaneElts = 128 / ScalarBits;
@@ -217,9 +213,6 @@ void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// and punpckh*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -236,9 +229,6 @@ void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// and punpckl*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -255,13 +245,11 @@ void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// Decodes a broadcast of the first element of a vector.
void DecodeVectorBroadcast(unsigned NumElts,
SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.append(NumElts, 0);
}
-/// Decodes a broadcast of a subvector to a larger vector type.
void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask) {
unsigned Scale = DstNumElts / SrcNumElts;
@@ -271,9 +259,6 @@ void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
ShuffleMask.push_back(j);
}
-/// Decode a shuffle packed values at 128-bit granularity
-/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
-/// immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
@@ -374,7 +359,6 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
}
}
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
for (unsigned l = 0; l != NumElts; l += 4)
@@ -384,32 +368,31 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
unsigned NumDstElts, bool IsAnyExtend,
- SmallVectorImpl<int> &Mask) {
+ SmallVectorImpl<int> &ShuffleMask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
+ int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero;
for (unsigned i = 0; i != NumDstElts; i++) {
- Mask.push_back(i);
- for (unsigned j = 1; j != Scale; j++)
- Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
+ ShuffleMask.push_back(i);
+ ShuffleMask.append(Scale - 1, Sentinel);
}
}
void DecodeZeroMoveLowMask(unsigned NumElts,
SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(0);
- for (unsigned i = 1; i < NumElts; i++)
- ShuffleMask.push_back(SM_SentinelZero);
+ ShuffleMask.append(NumElts - 1, SM_SentinelZero);
}
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
- SmallVectorImpl<int> &Mask) {
+ SmallVectorImpl<int> &ShuffleMask) {
// First element comes from the first element of second source.
// Remaining elements: Load zero extends / Move copies from first source.
- Mask.push_back(NumElts);
+ ShuffleMask.push_back(NumElts);
for (unsigned i = 1; i < NumElts; i++)
- Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+ ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
index f52785063071..4ef9959f7a27 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
@@ -14,15 +14,16 @@
#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
+class APInt;
template <typename T> class ArrayRef;
+template <typename T> class SmallVectorImpl;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
@@ -61,20 +62,14 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufhw.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshuflw.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
@@ -82,20 +77,14 @@ void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for shufp*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask);
@@ -119,6 +108,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a shuffle packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
/// immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index db624378d517..3bebcc24fd3a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -28,7 +28,7 @@ public:
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
- void FinishImpl() override;
+ void finishImpl() override;
};
void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
@@ -52,11 +52,11 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
XTS->emitFPOData(ProcSym, Loc);
}
-void X86WinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void X86WinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
EmitWindowsUnwindTables();
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index d5494ef12370..11251fb2b2ba 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -159,7 +159,7 @@ bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) {
MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() {
MCSymbol *Label = getContext().createTempSymbol("cfi", true);
- getStreamer().EmitLabel(Label);
+ getStreamer().emitLabel(Label);
return Label;
}
@@ -372,13 +372,13 @@ void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) {
OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart
OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize
- OS.EmitIntValue(LocalSize, 4);
- OS.EmitIntValue(FPO->ParamsSize, 4);
- OS.EmitIntValue(MaxStackSize, 4);
- OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc
+ OS.emitInt32(LocalSize);
+ OS.emitInt32(FPO->ParamsSize);
+ OS.emitInt32(MaxStackSize);
+ OS.emitInt32(FrameFuncStrTabOff); // FrameFunc
OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2);
- OS.EmitIntValue(SavedRegSize, 2);
- OS.EmitIntValue(CurFlags, 4);
+ OS.emitInt16(SavedRegSize);
+ OS.emitInt32(CurFlags);
}
/// Compute and emit the real CodeView FrameData subsection.
@@ -398,12 +398,12 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
MCSymbol *FrameBegin = Ctx.createTempSymbol(),
*FrameEnd = Ctx.createTempSymbol();
- OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4);
+ OS.emitInt32(unsigned(DebugSubsectionKind::FrameData));
OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4);
- OS.EmitLabel(FrameBegin);
+ OS.emitLabel(FrameBegin);
// Start with the RVA of the function in question.
- OS.EmitValue(MCSymbolRefExpr::create(FPO->Function,
+ OS.emitValue(MCSymbolRefExpr::create(FPO->Function,
MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx),
4);
@@ -437,8 +437,8 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
FSM.emitFrameDataRecord(OS, Inst.Label);
}
- OS.EmitValueToAlignment(4, 0);
- OS.EmitLabel(FrameEnd);
+ OS.emitValueToAlignment(4, 0);
+ OS.emitLabel(FrameEnd);
return false;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.h b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
index a0ab5c3a5b3c..91ba4e3d091e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.h
@@ -19,9 +19,7 @@
namespace llvm {
class FunctionPass;
-class ImmutablePass;
class InstructionSelector;
-class ModulePass;
class PassRegistry;
class X86RegisterBankInfo;
class X86Subtarget;
@@ -129,14 +127,23 @@ FunctionPass *createX86DiscriminateMemOpsPass();
/// This pass applies profiling information to insert cache prefetches.
FunctionPass *createX86InsertPrefetchPass();
+/// This pass inserts a wait instruction after X87 instructions which could
+/// raise FP exceptions when strict-fp is enabled.
+FunctionPass *createX86InsertX87waitPass();
+
+/// This pass optimizes arithmetic based on knowledge that it is only used by
+/// a reduction sequence and is therefore safe to reassociate in interesting
+/// ways.
+FunctionPass *createX86PartialReductionPass();
+
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
-FunctionPass *createX86LoadValueInjectionLoadHardeningUnoptimizedPass();
FunctionPass *createX86LoadValueInjectionRetHardeningPass();
FunctionPass *createX86SpeculativeLoadHardeningPass();
+FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
@@ -144,18 +151,21 @@ void initializeFixupLEAPassPass(PassRegistry &);
void initializeFPSPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
+void initializeX86FixupSetCCPassPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
-void initializeX86LoadValueInjectionLoadHardeningUnoptimizedPassPass(PassRegistry &);
void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.td b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
index bb8952f54e3a..dc1ff72add49 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.td
@@ -52,13 +52,16 @@ def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
"Support xsave instructions">;
def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
- "Support xsaveopt instructions">;
+ "Support xsaveopt instructions",
+ [FeatureXSAVE]>;
def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
- "Support xsavec instructions">;
+ "Support xsavec instructions",
+ [FeatureXSAVE]>;
def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
- "Support xsaves instructions">;
+ "Support xsaves instructions",
+ [FeatureXSAVE]>;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions">;
@@ -246,6 +249,14 @@ def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
// target-feature attribute.
def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
"Deprecated. Support MPX instructions">;
+def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true",
+ "Support AMX-TILE instructions">;
+def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true",
+ "Support AMX-INT8 instructions",
+ [FeatureAMXTILE]>;
+def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true",
+ "Support AMX-BF16 instructions",
+ [FeatureAMXTILE]>;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
@@ -273,6 +284,10 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
+def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true",
+ "Has serialize instruction">;
+def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
+ "Support TSXLDTRK instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -329,6 +344,11 @@ def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+// If the target can efficiently decode NOPs up to 7 bytes in length.
+def FeatureFast7ByteNOP
+ : SubtargetFeature<
+ "fast-7bytenop", "HasFast7ByteNOP", "true",
+ "Target can quickly decode up to 7 byte NOPs">;
// If the target can efficiently decode NOPs up to 11 bytes in length.
def FeatureFast11ByteNOP
: SubtargetFeature<
@@ -435,6 +455,15 @@ def FeatureLVIControlFlowIntegrity
"LFENCE instruction to serialize control flow. Also decompose RET "
"instructions into a POP+LFENCE+JMP sequence.">;
+// Enable SESES to mitigate speculative execution attacks
+def FeatureSpeculativeExecutionSideEffectSuppression
+ : SubtargetFeature<
+ "seses", "UseSpeculativeExecutionSideEffectSuppression", "true",
+ "Prevent speculative execution side channel timing attacks by "
+ "inserting a speculation barrier before memory reads, memory writes, "
+ "and conditional branches. Implies LVI Control Flow integrity.",
+ [FeatureLVIControlFlowIntegrity]>;
+
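The SESES mitigation defined above is wired into the ordinary subtarget-feature machinery, so a front end or tool enables it with a feature string rather than a dedicated API. A minimal sketch of that usage (not from the patch itself), assuming the X86 target has already been registered; the triple, CPU name, and lack of error handling are illustrative, and only the "+seses" feature name comes from the definition above.

#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

static llvm::TargetMachine *createSesesTargetMachine() {
  using namespace llvm;
  std::string Error;
  const Target *T =
      TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Error);
  if (!T)
    return nullptr;
  // "+seses" enables FeatureSpeculativeExecutionSideEffectSuppression, which
  // in turn pulls in the LVI control-flow-integrity feature it implies.
  return T->createTargetMachine("x86_64-unknown-linux-gnu", "generic",
                                "+seses", TargetOptions(), None);
}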
// Mitigate LVI attacks against data loads
def FeatureLVILoadHardening
: SubtargetFeature<
@@ -562,7 +591,8 @@ def ProcessorFeatures {
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
- FeatureMergeToThreeWayBranch];
+ FeatureMergeToThreeWayBranch,
+ FeatureFast15ByteNOP];
list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SNBInheritableFeatures =
@@ -744,6 +774,7 @@ def ProcessorFeatures {
list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
FeatureSlowDivide64,
FeatureSlowPMULLD,
+ FeatureFast7ByteNOP,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SLMInheritableFeatures =
!listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
@@ -778,15 +809,13 @@ def ProcessorFeatures {
!listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
// Tremont
- list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
- FeatureGFNI,
- FeatureMOVDIRI,
- FeatureMOVDIR64B,
- FeatureWAITPKG];
+ list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
+ FeatureGFNI];
list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
+ list<SubtargetFeature> TRMInheritableFeatures =
+ !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures);
list<SubtargetFeature> TRMFeatures =
- !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
- TRMSpecificFeatures);
+ !listconcat(TRMInheritableFeatures, TRMSpecificFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
@@ -838,6 +867,7 @@ def ProcessorFeatures {
FeatureFXSR,
FeatureNOPL,
FeatureCMPXCHG16B,
+ FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
@@ -933,6 +963,8 @@ def ProcessorFeatures {
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
FeatureBMI2,
+ FeatureMOVBE,
+ FeatureRDRAND,
FeatureMWAITX];
list<SubtargetFeature> BdVer4InheritableFeatures =
!listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
@@ -993,7 +1025,7 @@ def ProcessorFeatures {
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled
+// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
@@ -1256,6 +1288,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
FeatureSlowIncDec,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 1ac291fcb887..aa03217d155d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "TargetInfo/X86TargetInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -40,6 +41,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
@@ -76,7 +79,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -87,7 +90,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-void X86AsmPrinter::EmitFunctionBodyStart() {
+void X86AsmPrinter::emitFunctionBodyStart() {
if (EmitFPOData) {
if (auto *XTS =
static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
@@ -97,7 +100,7 @@ void X86AsmPrinter::EmitFunctionBodyStart() {
}
}
-void X86AsmPrinter::EmitFunctionBodyEnd() {
+void X86AsmPrinter::emitFunctionBodyEnd() {
if (EmitFPOData) {
if (auto *XTS =
static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
@@ -124,7 +127,7 @@ void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = getSymbol(GV);
+ GVSym = getSymbolPreferLocal(*GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
@@ -619,7 +622,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
-void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatELF()) {
@@ -641,17 +644,17 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
// Emitting note header.
int WordSize = TT.isArch64Bit() ? 8 : 4;
- EmitAlignment(WordSize == 4 ? Align(4) : Align(8));
- OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
- OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
- OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
- OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8));
+ OutStreamer->emitIntValue(4, 4 /*size*/); // data size for "GNU\0"
+ OutStreamer->emitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
+ OutStreamer->emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
+ OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
// Emitting an Elf_Prop for the CET properties.
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
- OutStreamer->EmitIntValue(4, 4); // data size
- OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data
- EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
+ OutStreamer->emitInt32(ELF::GNU_PROPERTY_X86_FEATURE_1_AND);
+ OutStreamer->emitInt32(4); // data size
+ OutStreamer->emitInt32(FeatureFlagsAnd); // data
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
@@ -683,30 +686,30 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
if (M.getModuleFlag("cfguard"))
Feat00Flags |= 0x800; // Object is CFG-aware.
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
- OutStreamer->EmitAssignment(
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitAssignment(
S, MCConstantExpr::create(Feat00Flags, MMI->getContext()));
}
- OutStreamer->EmitSyntaxDirective();
+ OutStreamer->emitSyntaxDirective();
// If this is not inline asm and we're in 16-bit
// mode prefix assembly with .code16.
bool is16 = TT.getEnvironment() == Triple::CODE16;
if (M.getModuleInlineAsm().empty() && is16)
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
}
static void
emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
MachineModuleInfoImpl::StubValueTy &MCSym) {
// L_foo$stub:
- OutStreamer.EmitLabel(StubLabel);
+ OutStreamer.emitLabel(StubLabel);
// .indirect_symbol _foo
- OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+ OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
if (MCSym.getInt())
// External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.emitIntValue(0, 4/*size*/);
else
// Internal to current translation unit.
//
@@ -714,7 +717,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
// pointers need to be indirect and pc-rel. We accomplish this by
// using NLPs; however, sometimes the types are local to the file.
// We need to fill in the value for the NLP in those cases.
- OutStreamer.EmitValue(
+ OutStreamer.emitValue(
MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
4 /*size*/);
}
@@ -742,7 +745,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
}
}
-void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
@@ -759,7 +762,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// points). If this doesn't occur, the linker can safely perform dead code
// stripping. Since LLVM never generates code that does this, it is always
// safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
} else if (TT.isOSBinFormatCOFF()) {
if (MMI->usesMSVCFloatingPoint()) {
// In Windows' libcmt.lib, there is a file which is linked in only if the
@@ -778,7 +781,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
StringRef SymbolName =
(TT.getArch() == Triple::x86) ? "__fltused" : "_fltused";
MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
return;
}
emitStackMaps(SM);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
index ee79401dc80d..eb485fa2ecef 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -9,12 +9,9 @@
#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
-#include "X86Subtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Target/TargetMachine.h"
// Implemented in X86MCInstLower.cpp
namespace {
@@ -22,8 +19,10 @@ namespace {
}
namespace llvm {
+class MCCodeEmitter;
class MCStreamer;
-class MCSymbol;
+class X86Subtarget;
+class TargetMachine;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget = nullptr;
@@ -123,14 +122,14 @@ public:
const X86Subtarget &getSubtarget() const { return *Subtarget; }
- void EmitStartOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
- void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
- AsmPrinter::EmitBasicBlockEnd(MBB);
+ void emitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+ AsmPrinter::emitBasicBlockEnd(MBB);
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
}
@@ -147,8 +146,8 @@ public:
}
bool runOnMachineFunction(MachineFunction &F) override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 0f1d4b51062e..9f1fece1b9dd 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -1,4 +1,4 @@
-//===- X86AvoidStoreForwardingBlockis.cpp - Avoid HW Store Forward Block --===//
+//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -33,6 +33,7 @@
// transformation done here is correct regardless to other memory accesses.
//===----------------------------------------------------------------------===//
+#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -287,7 +288,7 @@ static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
return 0;
}
-static int getAddrOffset(MachineInstr *MI) {
+static int getAddrOffset(const MachineInstr *MI) {
const MCInstrDesc &Descl = MI->getDesc();
int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
assert(AddrOffset != -1 && "Expected Memory Operand");
@@ -310,11 +311,11 @@ static MachineOperand &getDispOperand(MachineInstr *MI) {
// TODO: Consider expanding to other addressing modes in the future
static bool isRelevantAddressingMode(MachineInstr *MI) {
int AddrOffset = getAddrOffset(MI);
- MachineOperand &Base = getBaseOperand(MI);
- MachineOperand &Disp = getDispOperand(MI);
- MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
- MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
- MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+ const MachineOperand &Base = getBaseOperand(MI);
+ const MachineOperand &Disp = getDispOperand(MI);
+ const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
return false;
@@ -410,9 +411,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
// If the load and store are consecutive, use the loadInst location to
// reduce register pressure.
MachineInstr *StInst = StoreInst;
- auto PrevInstrIt = skipDebugInstructionsBackward(
- std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
- MBB->instr_begin());
+ auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ MBB->instr_begin());
if (PrevInstrIt.getNodePtr() == LoadInst)
StInst = LoadInst;
MachineInstr *NewStore =
@@ -498,9 +498,10 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
MachineOperand &LoadBase = getBaseOperand(LoadInst);
MachineOperand &StoreBase = getBaseOperand(StoreInst);
- auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
- std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
- LoadInst->getParent()->instr_begin()).getNodePtr();
+ auto *StorePrevNonDbgInstr =
+ prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ LoadInst->getParent()->instr_begin())
+ .getNodePtr();
if (LoadBase.isReg()) {
MachineInstr *LastLoad = LoadInst->getPrevNode();
// If the original load and store to xmm/ymm were consecutive
@@ -550,11 +551,8 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
if (StoreMI.getParent() == MI.getParent() &&
isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
isRelevantAddressingMode(&MI) &&
- isRelevantAddressingMode(&StoreMI)) {
- assert(MI.hasOneMemOperand() &&
- "Expected one memory operand for load instruction");
- assert(StoreMI.hasOneMemOperand() &&
- "Expected one memory operand for store instruction");
+ isRelevantAddressingMode(&StoreMI) &&
+ MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
}
@@ -563,7 +561,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
}
unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
- auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
+ const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
*LoadInst->getParent()->getParent());
return TRI->getRegSizeInBits(*TRC) / 8;
}
@@ -616,8 +614,8 @@ void X86AvoidSFBPass::breakBlockedCopies(
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
MachineInstr *StoreInst) {
- MachineOperand &LoadBase = getBaseOperand(LoadInst);
- MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ const MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ const MachineOperand &StoreBase = getBaseOperand(StoreInst);
if (LoadBase.isReg() != StoreBase.isReg())
return false;
if (LoadBase.isReg())
@@ -691,13 +689,12 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineInstr *, 2> PotentialBlockers =
findPotentialBlockers(LoadInst);
- for (auto PBInst : PotentialBlockers) {
+ for (auto *PBInst : PotentialBlockers) {
if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
LoadInst->getOpcode()) ||
- !isRelevantAddressingMode(PBInst))
+ !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
continue;
int64_t PBstDispImm = getDispOperand(PBInst).getImm();
- assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
// This check doesn't cover all cases, but it will suffice for now.
// TODO: take branch probability into consideration, if the blocking
@@ -727,7 +724,7 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
ForRemoval.push_back(LoadInst);
ForRemoval.push_back(StoreInst);
}
- for (auto RemovedInst : ForRemoval) {
+ for (auto *RemovedInst : ForRemoval) {
RemovedInst->eraseFromParent();
}
ForRemoval.clear();
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
index fb4f9e2901dc..0899783d5f60 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -6,10 +6,29 @@
//
//===----------------------------------------------------------------------===//
//
-// The Windows x64 unwinder has trouble unwinding the stack when a return
-// address points to the end of the function. This pass maintains the invariant
-// that every return address is inside the bounds of its parent function or
-// funclet by inserting int3 if the last instruction would otherwise be a call.
+// The Windows x64 unwinder decodes the instruction stream during unwinding.
+// The unwinder decodes forward from the current PC to detect epilogue code
+// patterns.
+//
+// First, this means that there must be an instruction after every
+// call instruction for the unwinder to decode. LLVM must maintain the invariant
+// that the last instruction of a function or funclet is not a call, or the
+// unwinder may decode into the next function. Similarly, a call may not
+// immediately precede an epilogue code pattern. As of this writing, the
+// SEH_Epilogue pseudo instruction takes care of that.
+//
+// Second, all non-tail call jump targets must be within the *half-open*
+// interval of the bounds of the function. The unwinder distinguishes between
+// internal jump instructions and tail calls in an epilogue sequence by checking
+// the jump target against the function bounds from the .pdata section. This
+// means that the last regular MBB of an LLVM function must not be empty if
+// there are regular jumps targeting it.
+//
+// This pass upholds these invariants by ensuring that blocks at the end of a
+// function or funclet are a) not empty and b) do not end in a CALL instruction.
+//
+// Unwinder implementation for reference:
+// https://github.com/dotnet/coreclr/blob/a9f3fc16483eecfc47fb79c362811d870be02249/src/unwinder/amd64/unwinder_amd64.cpp#L1015
//
//===----------------------------------------------------------------------===//
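The rewritten header comment relies on two small predicates that the hunks below call but do not show. A minimal sketch of what such helpers look like (not copied from the file; the real isCallInstruction and isRealInstruction in X86AvoidTrailingCall.cpp may differ in detail), using the standard MachineInstr queries:

#include "llvm/CodeGen/MachineInstr.h"

static bool isCallInstruction(const llvm::MachineInstr &MI) {
  // Any call, including a tail call, must not be the last instruction the
  // unwinder can decode in a function or funclet.
  return MI.isCall();
}

static bool isRealInstruction(const llvm::MachineInstr &MI) {
  // Labels, CFI directives, debug values, and pseudos that may expand to
  // nothing are invisible to the unwinder, so they do not count.
  return !MI.isPseudo() && !MI.isMetaInstruction();
}

With predicates of this shape, the pass body below only has to locate the last real instruction of each trailing block and decide whether an int3 is needed.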
@@ -18,33 +37,35 @@
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#define DEBUG_TYPE "x86-avoid-trailing-call"
+#define AVOIDCALL_DESC "X86 avoid trailing call pass"
+#define AVOIDCALL_NAME "x86-avoid-trailing-call"
+
+#define DEBUG_TYPE AVOIDCALL_NAME
using namespace llvm;
namespace {
-
class X86AvoidTrailingCallPass : public MachineFunctionPass {
public:
X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
-private:
- StringRef getPassName() const override {
- return "X86 avoid trailing call pass";
- }
static char ID;
+
+private:
+ StringRef getPassName() const override { return AVOIDCALL_DESC; }
};
+} // end anonymous namespace
char X86AvoidTrailingCallPass::ID = 0;
-} // end anonymous namespace
-
FunctionPass *llvm::createX86AvoidTrailingCallPass() {
return new X86AvoidTrailingCallPass();
}
+INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false)
+
// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
// expand to nothing, and some expand to code. This logic conservatively assumes
// they might expand to nothing.
@@ -62,6 +83,11 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
const X86InstrInfo &TII = *STI.getInstrInfo();
assert(STI.isTargetWin64() && "pass only runs on Win64");
+ // We don't need to worry about any of the invariants described above if there
+ // is no unwind info (CFI).
+ if (!MF.hasWinCFI())
+ return false;
+
// FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops
// before epilogues.
@@ -73,33 +99,34 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
if (NextMBB && !NextMBB->isEHFuncletEntry())
continue;
- // Find the last real instruction in this block, or previous blocks if this
- // block is empty.
- MachineBasicBlock::reverse_iterator LastRealInstr;
- for (MachineBasicBlock &RMBB :
- make_range(MBB.getReverseIterator(), MF.rend())) {
- LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction);
- if (LastRealInstr != RMBB.rend())
- break;
- }
-
- // Do nothing if this function or funclet has no instructions.
- if (LastRealInstr == MF.begin()->rend())
- continue;
+ // Find the last real instruction in this block.
+ auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction);
- // If this is a call instruction, insert int3 right after it with the same
- // DebugLoc. Convert back to a forward iterator and advance the insertion
- // position once.
- if (isCallInstruction(*LastRealInstr)) {
+ // If the block is empty or the last real instruction is a call instruction,
+ // insert an int3. If there is a call instruction, insert the int3 between
+ // the call and any labels or other meta instructions. If the block is
+ // empty, insert at block end.
+ bool IsEmpty = LastRealInstr == MBB.rend();
+ bool IsCall = !IsEmpty && isCallInstruction(*LastRealInstr);
+ if (IsEmpty || IsCall) {
LLVM_DEBUG({
- dbgs() << "inserting int3 after trailing call instruction:\n";
- LastRealInstr->dump();
- dbgs() << '\n';
+ if (IsCall) {
+ dbgs() << "inserting int3 after trailing call instruction:\n";
+ LastRealInstr->dump();
+ dbgs() << '\n';
+ } else {
+ dbgs() << "inserting int3 in trailing empty MBB:\n";
+ MBB.dump();
+ }
});
- MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse());
- BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(),
- TII.get(X86::INT3));
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ DebugLoc DL;
+ if (IsCall) {
+ MBBI = std::next(LastRealInstr.getReverse());
+ DL = LastRealInstr->getDebugLoc();
+ }
+ BuildMI(MBB, MBBI, DL, TII.get(X86::INT3));
Changed = true;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index f8faa572dffc..caa1f7952475 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
#include "X86FrameLowering.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
@@ -162,14 +163,13 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// memory for arguments.
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
- bool UseStackProbe =
- !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
for (MachineBasicBlock &BB : MF) {
bool InsideFrameSequence = false;
for (MachineInstr &MI : BB) {
if (MI.getOpcode() == FrameSetupOpcode) {
- if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe)
+ if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
return false;
if (InsideFrameSequence)
return false;
@@ -199,7 +199,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
if (CannotReserveFrame)
return true;
- unsigned StackAlign = TFL->getStackAlignment();
+ Align StackAlign = TFL->getStackAlign();
int64_t Advantage = 0;
for (auto CC : CallSeqVector) {
@@ -222,7 +222,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
// We'll need a add after the call.
Advantage -= 3;
// If we have to realign the stack, we'll also need a sub before
- if (CC.ExpectedDist % StackAlign)
+ if (!isAligned(StackAlign, CC.ExpectedDist))
Advantage -= 3;
// Now, for each push, we save ~3 bytes. For small constants, we actually,
// save more (up to 5 bytes), but 3 should be a good approximation.
@@ -531,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
+ Push->cloneMemRefs(MF, *Store);
break;
case X86::MOV32mr:
case X86::MOV64mr: {
@@ -550,7 +551,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.
- bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+ bool SlowPUSHrmm = STI->slowTwoMemOps();
// Check that this is legal to fold. Right now, we're extremely
// conservative about that.
@@ -562,6 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
Push->addOperand(DefMov->getOperand(i));
+ Push->cloneMergedMemRefs(MF, {&*DefMov, &*Store});
DefMov->eraseFromParent();
} else {
@@ -569,6 +571,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
.addReg(Reg)
.getInstr();
+ Push->cloneMemRefs(MF, *Store);
}
break;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
index 57bf799cf89c..319dc9470604 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -108,17 +108,15 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister());
+ auto SPReg =
+ MIRBuilder.buildCopy(p0, STI.getRegisterInfo()->getStackRegister());
- Register OffsetReg = MRI.createGenericVirtualRegister(SType);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(SType, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -139,7 +137,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
if (PhysRegSize > ValSize && LocSize == ValSize) {
assert((PhysRegSize == 128 || PhysRegSize == 80) && "We expect that to be 128 bit");
auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg);
- ExtReg = MIB->getOperand(0).getReg();
+ ExtReg = MIB.getReg(0);
} else
ExtReg = extendRegister(ValVReg, VA);
@@ -148,10 +146,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
Register ExtReg = extendRegister(ValVReg, VA);
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 1);
+
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
+ VA.getLocVT().getStoreSize(),
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -240,17 +240,17 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(
- LLT::pointer(0, DL.getPointerSizeInBits(0)));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
- return AddrReg;
+ return MIRBuilder
+ .buildFrameIndex(LLT::pointer(0, DL.getPointerSizeInBits(0)), FI)
+ .getReg(0);
}
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 1);
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
index 444a0c7d0122..b5ea7782896b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallLowering.h
@@ -14,12 +14,12 @@
#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
#define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include <functional>
namespace llvm {
+template <typename T> class ArrayRef;
class DataLayout;
class MachineRegisterInfo;
class X86TargetLowering;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
index aee344a26764..c899db60e016 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -60,7 +60,7 @@ static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT,
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
}
- // Successful in allocating regsiters - stop scanning next rules.
+ // Successful in allocating registers - stop scanning next rules.
return true;
}
@@ -166,7 +166,7 @@ static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
State.getMachineFunction().getSubtarget().getRegisterInfo();
if (TRI->regsOverlap(Reg, X86::XMM4) ||
TRI->regsOverlap(Reg, X86::XMM5))
- State.AllocateStack(8, 8);
+ State.AllocateStack(8, Align(8));
if (!ArgFlags.isHva()) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -281,7 +281,7 @@ static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (UseRegs)
It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
else
- It.convertToMem(State.AllocateStack(4, 4));
+ It.convertToMem(State.AllocateStack(4, Align(4)));
State.addLoc(It);
}
@@ -305,7 +305,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (ArgCount == 1 && ValNo == 0) {
// If we have one argument, the argument is five stack slots big, at fixed
// offset zero.
- Offset = State.AllocateStack(5 * SlotSize, 4);
+ Offset = State.AllocateStack(5 * SlotSize, Align(4));
} else if (ArgCount == 2 && ValNo == 0) {
// If we have two arguments, the stack slot is *after* the error code
// argument. Pretend it doesn't consume stack space, and account for it when
@@ -316,7 +316,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
// appears first on the stack, and is then followed by the five slot
// interrupt struct.
Offset = 0;
- (void)State.AllocateStack(6 * SlotSize, 4);
+ (void)State.AllocateStack(6 * SlotSize, Align(4));
} else {
report_fatal_error("unsupported x86 interrupt prototype");
}
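For context on the AllocateStack alignment changes in CC_X86_Intr above: this convention backs the Clang/GCC x86 interrupt attribute, where the hardware-pushed frame occupies the five fixed slots the comments describe and an error code, when present, reaches the handler as a separate second argument. A hedged source-level illustration (not from the patch; the struct and field names are purely illustrative, and such handlers are typically built with -mgeneral-regs-only):

// Five-slot frame pushed by the CPU on an interrupt or exception.
struct InterruptFrame {
  unsigned long ip, cs, flags, sp, ss;
};

// Two-argument form, used for exceptions that push an error code. The frame
// pointer the handler receives points past that error code, matching the
// "stack slot is *after* the error code argument" case handled above.
__attribute__((interrupt))
void PageFaultHandler(InterruptFrame *Frame, unsigned long ErrorCode) {
  (void)Frame;
  (void)ErrorCode;
}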
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
index db1aef2fd09d..802e694999b6 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CallingConv.td
@@ -789,8 +789,9 @@ def CC_X86_32_Vector_Darwin : CallingConv<[
/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
/// values are spilled on the stack.
def CC_X86_32_Common : CallingConv<[
- // Handles byval parameters.
+ // Handles byval/preallocated parameters.
CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfPreallocated<CCPassByVal<4, 4>>,
// The first 3 float or double arguments, if marked 'inreg' and if the call
// is not a vararg call and if SSE2 is available, are passed in SSE registers.
@@ -1145,7 +1146,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;
-def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15,
(sequence "ZMM%u", 16, 31),
K4, K5, K6, K7)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
index 7051550d52e6..2ff8ee19561b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
static cl::opt<bool> EnableDiscriminateMemops(
DEBUG_TYPE, cl::init(false),
cl::desc("Generate unique debug info for each instruction with a memory "
- "operand. Should be enabled for profile-drived cache prefetching, "
+ "operand. Should be enabled for profile-driven cache prefetching, "
"both in the build of the binary being profiled, as well as in "
"the build of the binary consuming the profile."),
cl::Hidden);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 438b9fd8eebb..488ee51f1d89 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -283,7 +283,7 @@ public:
// A converter is identified by <destination domain, source opcode>
typedef std::pair<int, unsigned> InstrConverterBaseKeyTy;
-typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>
+typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>>
InstrConverterBaseMap;
/// A closure is a set of virtual register representing all of the edges in
@@ -471,8 +471,8 @@ void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
// instruction.
for (int i = 0; i != NumDomains; ++i) {
if (C.isLegal((RegDomain)i)) {
- InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
- if (!IC || !IC->isLegal(MI, TII))
+ auto I = Converters.find({i, MI->getOpcode()});
+ if (I == Converters.end() || !I->second->isLegal(MI, TII))
C.setIllegal((RegDomain)i);
}
}
@@ -484,8 +484,8 @@ double X86DomainReassignment::calculateCost(const Closure &C,
double Cost = 0.0;
for (auto *MI : C.instructions())
- Cost +=
- Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
+ Cost += Converters.find({DstDomain, MI->getOpcode()})
+ ->second->getExtraCost(MI, MRI);
return Cost;
}
@@ -501,8 +501,8 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
// appropriate converter.
SmallVector<MachineInstr *, 8> ToErase;
for (auto *MI : C.instructions())
- if (Converters.lookup({Domain, MI->getOpcode()})
- ->convertInstr(MI, TII, MRI))
+ if (Converters.find({Domain, MI->getOpcode()})
+ ->second->convertInstr(MI, TII, MRI))
ToErase.push_back(MI);
// Iterate all registers in the closure, replace them with registers in the
@@ -606,19 +606,21 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
void X86DomainReassignment::initConverters() {
Converters[{MaskDomain, TargetOpcode::PHI}] =
- new InstrIgnore(TargetOpcode::PHI);
+ std::make_unique<InstrIgnore>(TargetOpcode::PHI);
Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
- new InstrIgnore(TargetOpcode::IMPLICIT_DEF);
+ std::make_unique<InstrIgnore>(TargetOpcode::IMPLICIT_DEF);
Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
- new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2);
+ std::make_unique<InstrReplaceWithCopy>(TargetOpcode::INSERT_SUBREG, 2);
Converters[{MaskDomain, TargetOpcode::COPY}] =
- new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY);
+ std::make_unique<InstrCOPYReplacer>(TargetOpcode::COPY, MaskDomain,
+ TargetOpcode::COPY);
auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
- Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To);
+ Converters[{MaskDomain, From}] =
+ std::make_unique<InstrReplacerDstCOPY>(From, To);
};
createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
@@ -638,7 +640,7 @@ void X86DomainReassignment::initConverters() {
}
auto createReplacer = [&](unsigned From, unsigned To) {
- Converters[{MaskDomain, From}] = new InstrReplacer(From, To);
+ Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To);
};
createReplacer(X86::MOV16rm, X86::KMOVWkm);
@@ -779,8 +781,6 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
}
}
- DeleteContainerSeconds(Converters);
-
LLVM_DEBUG(
dbgs() << "***** Machine Function after Domain Reassignment *****\n");
LLVM_DEBUG(MF.print(dbgs()));
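The converter table now owns its entries through std::unique_ptr, which is what makes the manual DeleteContainerSeconds call unnecessary; and because a value-returning lookup() would have to copy the mapped pointer, the call sites switch to find(). A short standard-library sketch of the same ownership pattern (illustrative only, not the pass code):

#include <map>
#include <memory>
#include <utility>

// <destination domain, source opcode>, matching the converter table key.
using Key = std::pair<int, unsigned>;

struct ConverterBase {
  virtual ~ConverterBase() = default;
  virtual double extraCost() const { return 0.0; }
};
struct Replacer : ConverterBase {
  double extraCost() const override { return 1.0; }
};

// A value-returning lookup() would have to copy the mapped unique_ptr, so the
// table is queried with find() and dereferenced through the iterator.
static double extraCostFor(const std::map<Key, std::unique_ptr<ConverterBase>> &Converters,
                           Key K) {
  auto It = Converters.find(K);
  return It == Converters.end() ? 0.0 : It->second->extraCost();
}

int main() {
  std::map<Key, std::unique_ptr<ConverterBase>> Converters;
  Converters[{1, 42}] = std::make_unique<Replacer>(); // no manual delete needed
  return extraCostFor(Converters, {1, 42}) == 1.0 ? 0 : 1;
}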
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
index f1cf9b94c9e5..540ad98b6d54 100755
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -237,11 +237,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
// Make sure the tables are sorted.
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(X86EvexToVex128CompressTable),
- std::end(X86EvexToVex128CompressTable)) &&
+ assert(llvm::is_sorted(X86EvexToVex128CompressTable) &&
"X86EvexToVex128CompressTable is not sorted!");
- assert(std::is_sorted(std::begin(X86EvexToVex256CompressTable),
- std::end(X86EvexToVex256CompressTable)) &&
+ assert(llvm::is_sorted(X86EvexToVex256CompressTable) &&
"X86EvexToVex256CompressTable is not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index d35d65914b34..c47ef4708e91 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -275,7 +275,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
- MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
+
+ // Update the call site info.
+ if (MBBI->isCandidateForCallSiteEntry())
+ MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -331,14 +334,6 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
- case X86::EH_RESTORE: {
- // Restore ESP and EBP, and optionally ESI if required.
- bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
- MBB.getParent()->getFunction().getPersonalityFn()));
- X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
- MBBI->eraseFromParent();
- return true;
- }
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
// Perform the following transformation.
@@ -371,6 +366,82 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ // Loading/storing mask pairs requires two kmov operations. The second one of
+ // these needs a 2-byte displacement relative to the specified address (with
+ // a 32-bit spill size). Mask pairs from 1-bit up to 16-bit masks all use the
+ // same spill size; they are all stored using MASKPAIR16STORE and loaded
+ // using MASKPAIR16LOAD.
+ //
+ // The displacement value could wrap around in theory, hence the asserts in
+ // both cases.
+ case X86::MASKPAIR16LOAD: {
+ int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(1 + i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(1 + i));
+ }
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::MASKPAIR16STORE: {
+ int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg();
+ bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(i));
+ }
+ MIBLo.addReg(Reg0, getKillRegState(SrcIsKill));
+ MIBHi.addReg(Reg1, getKillRegState(SrcIsKill));
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
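As a quick illustration of the MASKPAIR16LOAD/MASKPAIR16STORE expansion above, the two KMOVW halves differ only in their displacement; a minimal sketch of that split (not compiler code):

#include <cassert>
#include <cstdint>
#include <utility>

// The 32-bit mask-pair spill is accessed as two 16-bit halves: the low half at
// the original displacement and the high half at Disp + 2. The guard mirrors
// the asserts in the expansion.
static std::pair<int64_t, int64_t> maskPairHalfDisps(int64_t Disp) {
  assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
  return {Disp, Disp + 2};
}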
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
index a1d256ea872d..b305940139c0 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
@@ -498,7 +497,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
- unsigned AndResult = createResultReg(&X86::GR8RegClass);
+ Register AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult)
.addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
@@ -691,7 +690,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
}
}
- unsigned ValReg = getRegForValue(Val);
+ Register ValReg = getRegForValue(Val);
if (ValReg == 0)
return false;
@@ -761,9 +760,9 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Ok, we need to do a load from a stub. If we've already loaded from
// this stub, reuse the loaded pointer, otherwise emit the load now.
- DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
- unsigned LoadReg;
- if (I != LocalValueMap.end() && I->second != 0) {
+ DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
+ Register LoadReg;
+ if (I != LocalValueMap.end() && I->second) {
LoadReg = I->second;
} else {
// Issue load from stub.
@@ -1128,10 +1127,8 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
- unsigned Alignment = S->getAlignment();
- unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = ABIAlignment;
+ Align Alignment = S->getAlign();
+ Align ABIAlignment = DL.getABITypeAlign(Val->getType());
bool Aligned = Alignment >= ABIAlignment;
X86AddressMode AM;
@@ -1196,7 +1193,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1264,7 +1261,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
- unsigned Reg = X86MFInfo->getSRetReturnReg();
+ Register Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
@@ -1322,14 +1319,9 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
if (!X86SelectAddress(Ptr, AM))
return false;
- unsigned Alignment = LI->getAlignment();
- unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = ABIAlignment;
-
unsigned ResultReg = 0;
if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
- Alignment))
+ LI->getAlign().value()))
return false;
updateValueMap(I, ResultReg);
@@ -1392,7 +1384,7 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
const DebugLoc &CurDbgLoc) {
- unsigned Op0Reg = getRegForValue(Op0);
+ Register Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
// Handle 'null' like i32/i64 0.
@@ -1414,7 +1406,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
if (CompareOpc == 0) return false;
- unsigned Op1Reg = getRegForValue(Op1);
+ Register Op1Reg = getRegForValue(Op1);
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
.addReg(Op0Reg)
@@ -1487,8 +1479,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
@@ -1522,7 +1514,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
if (!TLI.isTypeLegal(DstVT))
return false;
- unsigned ResultReg = getRegForValue(I->getOperand(0));
+ Register ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
@@ -1548,7 +1540,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
default: llvm_unreachable("Unexpected zext to i64 source type");
}
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
.addReg(ResultReg);
@@ -1559,7 +1551,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
} else if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to zero
// extend to 32-bits and then extract down to 16-bits.
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
@@ -1581,7 +1573,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
if (!TLI.isTypeLegal(DstVT))
return false;
- unsigned ResultReg = getRegForValue(I->getOperand(0));
+ Register ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
@@ -1589,7 +1581,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
- unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
+ Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
/*TODO: Kill=*/false);
if (ZExtReg == 0)
return false;
@@ -1605,7 +1597,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to sign
// extend to 32-bits and then extract down to 16-bits.
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
@@ -1720,7 +1712,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
case MVT::i64: TestOpc = X86::TEST64ri32; break;
}
if (TestOpc) {
- unsigned OpReg = getRegForValue(TI->getOperand(0));
+ Register OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
@@ -1742,7 +1734,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
} else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
- unsigned TmpReg = getRegForValue(BI->getCondition());
+ Register TmpReg = getRegForValue(BI->getCondition());
if (TmpReg == 0)
return false;
@@ -1755,7 +1747,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Otherwise do a clumsy setcc and re-test it.
// Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
// in an explicit cast, so make sure to handle that correctly.
- unsigned OpReg = getRegForValue(BI->getCondition());
+ Register OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
// In case OpReg is a K register, COPY to a GPR
@@ -1824,10 +1816,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
if (!isTypeLegal(I->getType(), VT))
return false;
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0) return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CReg).addReg(Op1Reg);
@@ -1839,7 +1831,7 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
TII.get(TargetOpcode::KILL), X86::CL)
.addReg(CReg, RegState::Kill);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
.addReg(Op0Reg);
updateValueMap(I, ResultReg);
@@ -1933,10 +1925,10 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
const DivRemEntry &TypeEntry = OpTable[TypeIndex];
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0)
return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0)
return false;
@@ -1949,7 +1941,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpSignExtend));
else {
- unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+ Register Zero32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::MOV32r0), Zero32);
@@ -1986,8 +1978,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
if ((I->getOpcode() == Instruction::SRem ||
I->getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
- unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
- unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), SourceSuperReg).addReg(X86::AX);
@@ -2066,15 +2058,15 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return false;
if (SETFOpc) {
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg2).addImm(SETFOpc[1]);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
- unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+ Register TmpReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
.addReg(FlagReg2).addReg(FlagReg1);
} else {
@@ -2086,7 +2078,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
} else if (foldX86XALUIntrinsic(CC, I, Cond)) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
- unsigned TmpReg = getRegForValue(Cond);
+ Register TmpReg = getRegForValue(Cond);
if (TmpReg == 0)
return false;
@@ -2099,7 +2091,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// accurate. If we read more than the lsb, we may see non-zero values
// whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
// the select. This is achieved by performing TEST against 1.
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
@@ -2122,10 +2114,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
if (!LHSReg || !RHSReg)
@@ -2133,7 +2125,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
- unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
+ Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
@@ -2182,19 +2174,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
- unsigned CmpLHSReg = getRegForValue(CmpLHS);
+ Register CmpLHSReg = getRegForValue(CmpLHS);
bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
- unsigned CmpRHSReg = getRegForValue(CmpRHS);
+ Register CmpRHSReg = getRegForValue(CmpRHS);
bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
- if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
+ if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
@@ -2207,12 +2199,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned CmpOpcode =
(RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
// Need an IMPLICIT_DEF for the input that is used to generate the upper
// bits of the result register since its not based on any of the inputs.
- unsigned ImplicitDefReg = createResultReg(VR128X);
+ Register ImplicitDefReg = createResultReg(VR128X);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2241,9 +2233,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned BlendOpcode =
(RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+ Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CmpReg, true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2263,13 +2255,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
}
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
- unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
LHSReg, LHSIsKill);
- unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
RHSReg, RHSIsKill);
- unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
AndReg, /*IsKill=*/true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2317,7 +2309,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
} else {
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
@@ -2340,10 +2332,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
if (!LHSReg || !RHSReg)
@@ -2351,7 +2343,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
@@ -2373,12 +2365,12 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
}
// No need for a select anymore - this is an unconditional move.
if (Opnd) {
- unsigned OpReg = getRegForValue(Opnd);
+ Register OpReg = getRegForValue(Opnd);
if (OpReg == 0)
return false;
bool OpIsKill = hasTrivialKill(Opnd);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(OpReg, getKillRegState(OpIsKill));
@@ -2419,7 +2411,7 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
return false;
// Select integer to float/double conversion.
- unsigned OpReg = getRegForValue(I->getOperand(0));
+ Register OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
@@ -2448,10 +2440,10 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
- unsigned ImplicitDefReg = createResultReg(RC);
+ Register ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
updateValueMap(I, ResultReg);
return true;
@@ -2474,7 +2466,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
"Instruction must be an FPExt or FPTrunc!");
bool HasAVX = Subtarget->hasAVX();
- unsigned OpReg = getRegForValue(I->getOperand(0));
+ Register OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
@@ -2486,7 +2478,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
@@ -2537,7 +2529,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
if (!TLI.isTypeLegal(SrcVT))
return false;
- unsigned InputReg = getRegForValue(I->getOperand(0));
+ Register InputReg = getRegForValue(I->getOperand(0));
if (!InputReg)
// Unhandled operand. Halt "fast" selection and bail.
return false;
@@ -2549,7 +2541,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
}
// Issue an extract_subreg.
- unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
+ Register ResultReg = fastEmitInst_extractsubreg(MVT::i8,
InputReg, false,
X86::sub_8bit);
if (!ResultReg)
@@ -2608,7 +2600,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
const Value *Op = II->getArgOperand(0);
- unsigned InputReg = getRegForValue(Op);
+ Register InputReg = getRegForValue(Op);
if (InputReg == 0)
return false;
@@ -2632,12 +2624,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// used to provide rounding control: use MXCSR.RC, encoded as 0b100.
// It's consistent with the other FP instructions, which are usually
// controlled by MXCSR.
- InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
+ : X86::VCVTPS2PHrr;
+ InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4);
// Move the lower 32-bits of ResultReg to another register of class GR32.
+ Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
+ : X86::VMOVPDI2DIrr;
ResultReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::VMOVPDI2DIrr), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(InputReg, RegState::Kill);
// The result value is in the lower 16-bits of ResultReg.
@@ -2645,19 +2640,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
} else {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
- // Explicitly sign-extend the input to 32-bit.
- InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+ // Explicitly zero-extend the input to 32-bit.
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
/*Kill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
InputReg, /*Kill=*/true);
- InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
+ : X86::VCVTPH2PSrr;
+ InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
- ResultReg = createResultReg(&X86::FR32RegClass);
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(InputReg, RegState::Kill);
@@ -2700,7 +2697,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Always make a copy of the frame register to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
- unsigned SrcReg = createResultReg(RC);
+ Register SrcReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
@@ -2830,7 +2827,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
const Value *SrcVal = II->getArgOperand(0);
- unsigned SrcReg = getRegForValue(SrcVal);
+ Register SrcReg = getRegForValue(SrcVal);
if (SrcReg == 0)
return false;
@@ -2843,7 +2840,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg);
@@ -2903,7 +2900,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
}
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
bool LHSIsKill = hasTrivialKill(LHS);
@@ -2974,7 +2971,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
// Assign to a GPR since the overflow return value is lowered to a SETcc.
- unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
+ Register ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
ResultReg2).addImm(CondCode);
@@ -3041,11 +3038,11 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
Op = IE->getOperand(0);
}
- unsigned Reg = getRegForValue(Op);
+ Register Reg = getRegForValue(Op);
if (Reg == 0)
return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Reg);
@@ -3139,11 +3136,11 @@ bool X86FastISel::fastLowerArguments() {
case MVT::f32: LLVM_FALLTHROUGH;
case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
}
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
@@ -3154,7 +3151,7 @@ bool X86FastISel::fastLowerArguments() {
static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
CallingConv::ID CC,
- ImmutableCallSite *CS) {
+ const CallBase *CB) {
if (Subtarget->is64Bit())
return 0;
if (Subtarget->getTargetTriple().isOSMSVCRT())
@@ -3163,9 +3160,9 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
- if (CS)
- if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) ||
- CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
+ if (CB)
+ if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
+ CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
return 0;
return 4;
@@ -3186,14 +3183,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
- const CallInst *CI =
- CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
// Call / invoke instructions with NoCfCheck attribute require special
// handling.
- const auto *II =
- CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr;
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
return false;
@@ -3239,11 +3234,11 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
// Don't know about inalloca yet.
- if (CLI.CS && CLI.CS->hasInAllocaArgument())
+ if (CLI.CB && CLI.CB->hasInAllocaArgument())
return false;
for (auto Flag : CLI.OutFlags)
- if (Flag.isSwiftError())
+ if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
SmallVector<MVT, 16> OutVTs;
@@ -3269,9 +3264,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
MVT VT;
auto *TI = dyn_cast<TruncInst>(Val);
unsigned ResultReg;
- if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
- (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
- TI->hasOneUse()) {
+ if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
+ (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
Value *PrevVal = TI->getOperand(0);
ResultReg = getRegForValue(PrevVal);
@@ -3284,7 +3278,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
ResultReg =
fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
} else {
- if (!isTypeLegal(Val->getType(), VT))
+ if (!isTypeLegal(Val->getType(), VT) ||
+ (VT.isVector() && VT.getVectorElementType() == MVT::i1))
return false;
ResultReg = getRegForValue(Val);
}
@@ -3302,7 +3297,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Allocate shadow area for Win64
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
@@ -3406,7 +3401,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
OutRegs.push_back(VA.getLocReg());
} else {
- assert(VA.isMemLoc());
+ assert(VA.isMemLoc() && "Unknown value location!");
// Don't emit stores for undef values.
if (isa<UndefValue>(ArgVal))
@@ -3417,7 +3412,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
@@ -3537,7 +3532,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
TM.Options.GuaranteedTailCallOpt)
? NumBytes // Callee pops everything.
- : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
+ : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesForCalleeToPop);
@@ -3549,7 +3544,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
- unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+ Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
@@ -3582,7 +3577,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
EVT ResVT = VA.getValVT();
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
- int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
@@ -3647,7 +3642,7 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
return X86SelectTrunc(I);
- unsigned Reg = getRegForValue(I->getOperand(0));
+ Register Reg = getRegForValue(I->getOperand(0));
if (Reg == 0) return false;
updateValueMap(I, Reg);
return true;
@@ -3668,13 +3663,18 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
DstVT.getVectorElementType() == MVT::i1)
return false;
- unsigned Reg = getRegForValue(I->getOperand(0));
- if (Reg == 0)
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
return false;
- // No instruction is needed for conversion. Reuse the register used by
- // the fist operand.
- updateValueMap(I, Reg);
+ // Emit a reg-reg copy so we don't propagate cached known bits information
+ // with the wrong VT if we fall out of fast isel after selecting this.
+ const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
+ Register ResultReg = createResultReg(DstClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+
+ updateValueMap(I, ResultReg);
return true;
}
}
@@ -3688,7 +3688,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
uint64_t Imm = CI->getZExtValue();
if (Imm == 0) {
- unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+ Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
@@ -3701,7 +3701,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
case MVT::i32:
return SrcReg;
case MVT::i64: {
- unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ Register ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
@@ -3769,11 +3769,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
}
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0) {
- // Alignment of vector types. FIXME!
- Align = DL.getTypeAllocSize(CFP->getType());
- }
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
@@ -3786,11 +3782,12 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
PICBase = X86::RIP;
// Create the load from the constant pool.
- unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
- if (CM == CodeModel::Large) {
- unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+ // Large code model only applies to 64-bit mode.
+ if (Subtarget->is64Bit() && CM == CodeModel::Large) {
+ Register AddrReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
@@ -3799,7 +3796,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
- MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
@@ -3824,7 +3821,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (TM.getRelocationModel() == Reloc::Static &&
TLI.getPointerTy(DL) == MVT::i64) {
// The displacement code could be more than 32 bits away so we need to use
@@ -3883,7 +3880,7 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
@@ -3916,7 +3913,7 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
@@ -3932,16 +3929,12 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const X86InstrInfo &XII = (const X86InstrInfo &)TII;
unsigned Size = DL.getTypeAllocSize(LI->getType());
- unsigned Alignment = LI->getAlignment();
-
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = DL.getABITypeAlignment(LI->getType());
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result = XII.foldMemoryOperandImpl(
- *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+ *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
/*AllowCommute=*/true);
if (!Result)
return false;
@@ -3958,7 +3951,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
continue;
// Found the index reg, now try to rewrite it.
- unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
+ Register IndexReg = constrainOperandRegClass(Result->getDesc(),
MO.getReg(), OperandNo);
if (IndexReg == MO.getReg())
continue;
@@ -3980,7 +3973,7 @@ unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
unsigned Op3, bool Op3IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
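The X86FastISel hunks above switch the half-float conversions to the EVEX forms when VLX is available and keep the immediate 4 (MXCSR.RC) as the rounding control. A user-level sketch of the same conversion using the F16C intrinsics, under the assumption of an F16C-capable host; this illustrates the operation, not the FastISel lowering itself:

// Build with: clang++ -std=c++17 -mf16c fp16_roundtrip.cpp  (file name is illustrative)
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static uint16_t floatToHalf(float F) {
  // imm 4 == _MM_FROUND_CUR_DIRECTION: round according to MXCSR.RC, matching
  // the immediate operand used in the lowering above.
  __m128i H = _mm_cvtps_ph(_mm_set_ss(F), _MM_FROUND_CUR_DIRECTION);
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));
}

static float halfToFloat(uint16_t H) {
  return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(H)));
}

int main() {
  float Orig = 3.14159f;
  uint16_t Half = floatToHalf(Orig);
  std::printf("0x%04x -> %f\n", static_cast<unsigned>(Half), halfToFloat(Half));
  return 0;
}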
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index f8c4a2adb851..78de041329e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -350,7 +350,7 @@ MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
return nullptr;
// Don't interfere with formation of CBW instructions which should be a
- // shorter encoding than even the MOVSX32rr8. It's also immunte to partial
+ // shorter encoding than even the MOVSX32rr8. It's also immune to partial
// merge issues on Intel CPUs.
if (MI->getOpcode() == X86::MOVSX16rr8 &&
MI->getOperand(0).getReg() == X86::AX &&
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 9ac401bb0253..424279038921 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -16,8 +16,11 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
@@ -111,6 +114,12 @@ public:
MachineFunctionProperties::Property::NoVRegs);
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
TargetSchedModel TSM;
const X86InstrInfo *TII = nullptr;
@@ -205,21 +214,27 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
TSM.init(&ST);
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary())
+ ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
for (MachineBasicBlock &MBB : MF) {
// First pass. Try to remove or optimize existing LEAs.
+ bool OptIncDecPerBB =
+ OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
if (!isLEA(I->getOpcode()))
continue;
- if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+ if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP))
continue;
if (IsSlowLEA)
processInstructionForSlowLEA(I, MBB);
else if (IsSlow3OpsLEA)
- processInstrForSlow3OpLEA(I, MBB, OptIncDec);
+ processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB);
}
// Second pass for creating LEAs. This may reverse some of the
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 924f429fc138..09668d7c5468 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -36,6 +36,8 @@ STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
namespace {
class X86FixupSetCCPass : public MachineFunctionPass {
public:
+ static char ID;
+
X86FixupSetCCPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 Fixup SetCC"; }
@@ -47,12 +49,12 @@ private:
const X86InstrInfo *TII = nullptr;
enum { SearchBound = 16 };
-
- static char ID;
};
+} // end anonymous namespace
char X86FixupSetCCPass::ID = 0;
-}
+
+INITIALIZE_PASS(X86FixupSetCCPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index b1d2de29c896..831695dabcd8 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -124,10 +124,6 @@ private:
MachineInstr &JmpI, CondRegArray &CondRegs);
void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
MachineInstr &CopyDefI);
- void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &SetBI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
void rewriteSetCC(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
MachineInstr &SetCCI, MachineOperand &FlagUse,
@@ -165,6 +161,7 @@ enum class FlagArithMnemonic {
RCL,
RCR,
SBB,
+ SETB,
};
} // namespace
@@ -235,6 +232,10 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
case X86::ADOX32rm:
case X86::ADOX64rm:
return FlagArithMnemonic::ADOX;
+
+ case X86::SETB_C32r:
+ case X86::SETB_C64r:
+ return FlagArithMnemonic::SETB;
}
}
@@ -638,24 +639,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// logic.
FlagsKilled = true;
- switch (MI.getOpcode()) {
- case X86::SETB_C8r:
- case X86::SETB_C16r:
- case X86::SETB_C32r:
- case X86::SETB_C64r:
- // Use custom lowering for arithmetic that is merely extending the
- // carry flag. We model this as the SETB_C* pseudo instructions.
- rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
- break;
-
- default:
- // Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
- break;
- }
- break;
+ // Generically handle remaining uses as arithmetic instructions.
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ CondRegs);
}
// If this was the last use of the flags, we're done.
@@ -821,6 +807,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
case FlagArithMnemonic::RCL:
case FlagArithMnemonic::RCR:
case FlagArithMnemonic::SBB:
+ case FlagArithMnemonic::SETB:
Cond = X86::COND_B; // CF == 1
// Set up an addend that when one is added will need a carry due to not
// having a higher bit available.
@@ -959,130 +946,6 @@ void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
MI.eraseFromParent();
}
-void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // This routine is only used to handle pseudos for setting a register to zero
- // or all ones based on CF. This is essentially the sign extended from 1-bit
- // form of SETB and modeled with the SETB_C* pseudos. They require special
- // handling as they aren't normal SETcc instructions and are lowered to an
- // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
- // they are only provided in reg-defining forms. A complicating factor is that
- // they can define many different register widths.
- assert(SetBI.getOperand(0).isReg() &&
- "Cannot have a non-register defined operand to this variant of SETB!");
-
- // Little helper to do the common final step of replacing the register def'ed
- // by this SETB instruction with a new register and removing the SETB
- // instruction.
- auto RewriteToReg = [&](unsigned Reg) {
- MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
- SetBI.eraseFromParent();
- };
-
- // Grab the register class used for this particular instruction.
- auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
-
- MachineBasicBlock &MBB = *SetBI.getParent();
- auto SetPos = SetBI.getIterator();
- auto SetLoc = SetBI.getDebugLoc();
-
- auto AdjustReg = [&](unsigned Reg) {
- auto &OrigRC = *MRI->getRegClass(Reg);
- if (&OrigRC == &SetBRC)
- return Reg;
-
- unsigned NewReg;
-
- int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
- int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
- assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
- assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
- int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
- X86::NoSubRegister, X86::sub_32bit};
-
- // If the original size is smaller than the target *and* is smaller than 4
- // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
- // to maximize the chance of being able to CSE that operation and to avoid
- // partial dependency stalls extending to 2-bytes.
- if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
- NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
- .addReg(Reg);
- if (&SetBRC == &X86::GR32RegClass)
- return NewReg;
- Reg = NewReg;
- OrigRegSize = 4;
- }
-
- NewReg = MRI->createVirtualRegister(&SetBRC);
- if (OrigRegSize < TargetRegSize) {
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
- NewReg)
- .addImm(0)
- .addReg(Reg)
- .addImm(SubRegIdx[OrigRegSize]);
- } else if (OrigRegSize > TargetRegSize) {
- if (TargetRegSize == 1 && !Subtarget->is64Bit()) {
- // Need to constrain the register class.
- MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass);
- }
-
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY),
- NewReg)
- .addReg(Reg, 0, SubRegIdx[TargetRegSize]);
- } else {
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
- .addReg(Reg);
- }
- return NewReg;
- };
-
- unsigned &CondReg = CondRegs[X86::COND_B];
- if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
-
- // Adjust the condition to have the desired register width by zero-extending
- // as needed.
- // FIXME: We should use a better API to avoid the local reference and using a
- // different variable here.
- unsigned ExtCondReg = AdjustReg(CondReg);
-
- // Now we need to turn this into a bitmask. We do this by subtracting it from
- // zero.
- Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
- ZeroReg = AdjustReg(ZeroReg);
-
- unsigned Sub;
- switch (SetBI.getOpcode()) {
- case X86::SETB_C8r:
- Sub = X86::SUB8rr;
- break;
-
- case X86::SETB_C16r:
- Sub = X86::SUB16rr;
- break;
-
- case X86::SETB_C32r:
- Sub = X86::SUB32rr;
- break;
-
- case X86::SETB_C64r:
- Sub = X86::SUB64rr;
- break;
-
- default:
- llvm_unreachable("Invalid SETB_C* opcode!");
- }
- Register ResultReg = MRI->createVirtualRegister(&SetBRC);
- BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
- .addReg(ZeroReg)
- .addReg(ExtCondReg);
- return RewriteToReg(ResultReg);
-}
-
void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 13bbd6ccfce4..e6ee46957500 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -1364,6 +1364,9 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
MBB->remove(&*I++);
I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+ if (!MI.mayRaiseFPException())
+ I->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
// If both operands are killed, pop one off of the stack in addition to
// overwriting the other one.
if (KillsOp0 && KillsOp1 && Op0 != Op1) {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
index 1da20371caf5..c7ca6fb2a4fc 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -17,6 +17,7 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -32,6 +33,12 @@
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
+#define DEBUG_TYPE "x86-fl"
+
+STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
+STATISTIC(NumFrameExtraProbe,
+ "Number of extra stack probes generated in prologue");
+
using namespace llvm;
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
@@ -50,7 +57,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects() &&
- !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
}
/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
@@ -60,6 +68,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool
X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
(hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
TRI->hasBasePointer(MF);
}
@@ -83,10 +92,10 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- TRI->needsStackRealignment(MF) ||
- MFI.hasVarSizedObjects() ||
+ TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
MFI.hasCopyImplyingStackAdjustment());
@@ -257,7 +266,20 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
uint64_t Chunk = (1LL << 31) - 1;
- if (Offset > Chunk) {
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+ // It's OK not to take large chunks into account when probing, as the
+ // allocation is split into smaller chunks anyway.
+ if (EmitInlineStackProbe && !InEpilogue) {
+
+ // This pseudo-instruction is going to be expanded, potentially using a
+ // loop, by inlineStackProbe().
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);
+ return;
+ } else if (Offset > Chunk) {
// Rather than emit a long series of instructions for large offsets,
// load the offset into a register and do one sub/add
unsigned Reg = 0;
@@ -381,8 +403,8 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
} else {
bool IsSub = Offset < 0;
uint64_t AbsOffset = IsSub ? -Offset : Offset;
- unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
- : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+ const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+ : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addImm(AbsOffset);
@@ -457,9 +479,32 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
.addCFIIndex(CFIIndex);
}
+/// Emits DWARF info specifying the offsets of callee-saved registers and the
+/// frame pointer. This is called only when basic block sections are enabled.
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ if (!hasFP(MF)) {
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+ return;
+ }
+ const MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const unsigned FramePtr = TRI->getFrameRegister(MF);
+ const unsigned MachineFramePtr =
+ STI.isTarget64BitILP32() ? unsigned(getX86SubSuperRegister(FramePtr, 64))
+ : FramePtr;
+ unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
+ // Offset = space for return address + size of the frame pointer itself.
+ unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+ BuildCFI(MBB, MBBI, DebugLoc{},
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+}
+
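The offset computed just above is simply the return-address slot plus the frame-pointer slot, negated because the saved frame pointer sits below the CFA. A minimal standalone sketch of that arithmetic (independent of the LLVM classes used here, with the usual 4-/8-byte slot sizes assumed):

    #include <cstdio>

    // DWARF offset of the saved frame pointer relative to the CFA:
    // one slot for the return address plus one slot for the frame pointer,
    // negated because the slot lives below the CFA.
    static int framePointerCfiOffset(bool Is64Bit, bool Uses64BitFramePtr) {
      int RetAddrSlot = Is64Bit ? 8 : 4;
      int FramePtrSlot = Uses64BitFramePtr ? 8 : 4;
      return -(RetAddrSlot + FramePtrSlot);
    }

    int main() {
      std::printf("i386:   %d\n", framePointerCfiOffset(false, false)); // -8
      std::printf("x86-64: %d\n", framePointerCfiOffset(true, true));   // -16
    }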
void X86FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL) const {
+ const DebugLoc &DL, bool IsPrologue) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -474,10 +519,15 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
-
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+
+ if (IsPrologue) {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ } else {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfReg));
+ }
}
}
@@ -488,7 +538,8 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF,
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
if (STI.isTargetWindowsCoreCLR()) {
if (InProlog) {
- emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))
+ .addImm(0 /* no explicit stack size */);
} else {
emitStackProbeInline(MF, MBB, MBBI, DL, false);
}
@@ -499,26 +550,13 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF,
void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const {
- const StringRef ChkStkStubSymbol = "__chkstk_stub";
- MachineInstr *ChkStkStub = nullptr;
-
- for (MachineInstr &MI : PrologMBB) {
- if (MI.isCall() && MI.getOperand(0).isSymbol() &&
- ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
- ChkStkStub = &MI;
- break;
- }
- }
-
- if (ChkStkStub != nullptr) {
- assert(!ChkStkStub->isBundled() &&
- "Not expecting bundled instructions here");
- MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
- assert(std::prev(MBBI) == ChkStkStub &&
- "MBBI expected after __chkstk_stub.");
- DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
- emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
- ChkStkStub->eraseFromParent();
+ auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
+ return MI.getOpcode() == X86::STACKALLOC_W_PROBING;
+ });
+ if (Where != PrologMBB.end()) {
+ DebugLoc DL = PrologMBB.findDebugLoc(Where);
+ emitStackProbeInline(MF, PrologMBB, Where, DL, true);
+ Where->eraseFromParent();
}
}
@@ -528,6 +566,167 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
const DebugLoc &DL,
bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
+ emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
+ else
+ emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
+}
+
+void X86FrameLowering::emitStackProbeInlineGeneric(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ MachineInstr &AllocWithProbe = *MBBI;
+ uint64_t Offset = AllocWithProbe.getOperand(0).getImm();
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
+ "different expansion expected for CoreCLR 64 bit");
+
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t ProbeChunk = StackProbeSize * 8;
+
+ // Synthesize a loop or unroll it, depending on the number of iterations.
+ if (Offset > ProbeChunk) {
+ emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+ } else {
+ emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
+ }
+}
+
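The threshold above means the probing sequence is fully unrolled only while the allocation fits in at most eight probe intervals; anything larger gets the loop form. A standalone sketch of that decision, assuming the common 4096-byte probe interval (the real value comes from TLI.getStackProbeSize):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t StackProbeSize = 4096;           // assumed probe interval
      const uint64_t ProbeChunk = StackProbeSize * 8; // unroll threshold
      const uint64_t Sizes[] = {1024, 16384, 32768, 40000, 1 << 20};
      for (uint64_t Offset : Sizes) {
        const char *Form = Offset > ProbeChunk ? "loop" : "unrolled block";
        std::printf("%8llu bytes -> %s\n", (unsigned long long)Offset, Form);
      }
    }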
+void X86FrameLowering::emitStackProbeInlineGenericBlock(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ uint64_t Offset) const {
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t CurrentOffset = 0;
+ // 0 thanks to the return address already being saved on the stack.
+ uint64_t CurrentProbeOffset = 0;
+
+ // For the first N - 1 pages, just probe. I tried to take advantage of
+ // natural probes, but it implies much more logic and there were very few
+ // interesting natural probes to interleave.
+ while (CurrentOffset + StackProbeSize < Offset) {
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ NumFrameExtraProbe++;
+ CurrentOffset += StackProbeSize;
+ CurrentProbeOffset += StackProbeSize;
+ }
+
+ uint64_t ChunkSize = Offset - CurrentOffset;
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ChunkSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+}
+
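As a concrete example of the unrolled form, with a 4096-byte probe interval a 10000-byte allocation becomes two probed page-sized steps followed by a final unprobed 1808-byte adjustment. A sketch that prints that sequence (the mnemonics in the output are illustrative, not the exact emitted MachineInstrs):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t StackProbeSize = 4096; // assumed probe interval
      const uint64_t Offset = 10000;        // requested allocation
      uint64_t CurrentOffset = 0;

      // Probe every full page except the last chunk.
      while (CurrentOffset + StackProbeSize < Offset) {
        std::printf("sub  rsp, %llu\n", (unsigned long long)StackProbeSize);
        std::printf("mov  qword ptr [rsp], 0\n");
        CurrentOffset += StackProbeSize;
      }

      // The final chunk is allocated without an extra probe.
      std::printf("sub  rsp, %llu\n", (unsigned long long)(Offset - CurrentOffset));
    }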
+void X86FrameLowering::emitStackProbeInlineGenericLoop(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ uint64_t Offset) const {
+ assert(Offset && "null offset");
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
+ // Synthesize a loop
+ NumFrameLoopProbe++;
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB.getIterator();
+ MF.insert(MBBIter, testMBB);
+ MF.insert(MBBIter, tailMBB);
+
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // save loop bound
+ {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)
+ .addReg(FinalStackProbed)
+ .addImm(Offset / StackProbeSize * StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // allocate a page
+ {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+ BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // touch the page
+ addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // cmp with stack pointer bound
+ BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(StackPtr)
+ .addReg(FinalStackProbed)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // jump
+ BuildMI(testMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(testMBB)
+ .addImm(X86::COND_NE)
+ .setMIFlag(MachineInstr::FrameSetup);
+ testMBB->addSuccessor(testMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // BB management
+ tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(testMBB);
+
+ // handle tail
+ unsigned TailOffset = Offset % StackProbeSize;
+ if (TailOffset) {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset);
+ BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(TailOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Update Live In information
+ recomputeLiveIns(*testMBB);
+ recomputeLiveIns(*tailMBB);
+}
+
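The loop only probes whole probe-interval chunks: the bound register gets SP minus Offset rounded down to a multiple of the probe size, and the remainder is subtracted in the tail block without an extra probe. A standalone sketch of that bookkeeping (again assuming a 4096-byte probe interval and an arbitrary starting stack pointer):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t StackProbeSize = 4096; // assumed probe interval
      const uint64_t Offset = 20000;        // requested allocation
      uint64_t SP = 0x7fff0000;             // hypothetical starting stack pointer
      const uint64_t Start = SP;

      // save loop bound: SP minus the probed portion of the allocation.
      const uint64_t FinalStackProbed = SP - Offset / StackProbeSize * StackProbeSize;

      // allocate and touch one page per iteration until the bound is reached.
      while (SP != FinalStackProbed) {
        SP -= StackProbeSize;
        std::printf("probe at 0x%llx\n", (unsigned long long)SP);
      }

      // handle tail: the remainder needs no extra probe.
      SP -= Offset % StackProbeSize;
      std::printf("final SP 0x%llx, allocated %llu bytes\n",
                  (unsigned long long)SP, (unsigned long long)(Start - SP));
    }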
+void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
assert(STI.is64Bit() && "different expansion needed for 32 bit");
assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -821,16 +1020,6 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
}
}
-void X86FrameLowering::emitStackProbeInlineStub(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
-
- assert(InProlog && "ChkStkStub called outside prolog!");
-
- BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
- .addExternalSymbol("__chkstk_stub");
-}
-
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// Win64 ABI has a less restrictive limitation of 240; 128 works equally well
// and might require smaller successive adjustments.
@@ -846,15 +1035,15 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// go with the minimum SlotSize.
uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
- unsigned StackAlign = getStackAlignment();
+ Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
+ Align StackAlign = getStackAlign();
if (MF.getFunction().hasFnAttribute("stackrealign")) {
if (MFI.hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
- MaxAlign = SlotSize;
+ MaxAlign = Align(SlotSize);
}
- return MaxAlign;
+ return MaxAlign.value();
}
void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
@@ -1014,7 +1203,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
- bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ const bool EmitStackProbeCall =
+ STI.getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
@@ -1032,11 +1222,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (has128ByteRedZone(MF) &&
- !TRI->needsStackRealignment(MF) &&
+ if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
- !UseStackProbe && // No stack probes.
+ !EmitStackProbeCall && // No stack probes.
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1115,7 +1304,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth));
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
@@ -1192,7 +1381,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset));
StackOffset += stackGrowth;
}
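The CFI changes in this function follow a single pattern: cfiDefCfaOffset takes the distance of the CFA above the stack pointer as a positive number, so every call site now negates the value it used to hand to createDefCfaOffset. A tiny sketch of the sign flip for the frame-pointer push on x86-64 (SlotSize assumed to be 8):

    #include <cassert>
    #include <cstdio>

    int main() {
      const int SlotSize = 8;            // x86-64 slot size
      const int stackGrowth = -SlotSize; // the stack grows down

      const int OldArgument = 2 * stackGrowth;  // what createDefCfaOffset received
      const int NewArgument = -2 * stackGrowth; // what cfiDefCfaOffset receives

      // Same directive, opposite sign convention at the call site.
      assert(NewArgument == -OldArgument);
      std::printf(".cfi_def_cfa_offset %d\n", NewArgument); // 16
    }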
@@ -1237,7 +1426,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
uint64_t AlignedNumBytes = NumBytes;
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
- if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
assert(!X86FI->getUsesRedZone() &&
"The Red Zone is not accounted for in stack probes");
@@ -1323,17 +1512,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
Establisher, false, PSPSlotOffset)
.addMemOperand(MF.getMachineMemOperand(
- NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));
;
// Save the root establisher back into the current funclet's (mostly
// empty) frame, in case a sub-funclet or the GC needs it.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
false, PSPSlotOffset)
.addReg(Establisher)
- .addMemOperand(
- MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile,
- SlotSize, SlotSize));
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo,
+ MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, Align(SlotSize)));
}
SPOrEstablisher = Establisher;
} else {
@@ -1370,7 +1559,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// into the registration node so that the runtime will restore it for us.
if (!MBB.isCleanupFuncletEntry()) {
assert(Personality == EHPersonality::MSVC_CXX);
- unsigned FrameReg;
+ Register FrameReg;
int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
// ESP is the first field, so no extra displacement is needed.
@@ -1389,7 +1578,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
int Offset;
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
if (IsWin64Prologue && IsFunclet)
Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
else
@@ -1423,7 +1612,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(StackPtr)
.addMemOperand(MF.getMachineMemOperand(
PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
- SlotSize, SlotSize));
+ SlotSize, Align(SlotSize)));
}
// Realign stack after we spilled callee-saved registers (so that we'll be
@@ -1464,7 +1653,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// it recovers the frame pointer from the base pointer rather than the
// other way around.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
- unsigned UsedReg;
+ Register UsedReg;
int Offset =
getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
assert(UsedReg == BasePtr);
@@ -1479,12 +1668,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
- nullptr, -StackSize + stackGrowth));
+ BuildCFI(
+ MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth));
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
- emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);
}
// X86 Interrupt handling function cannot assume anything about the direction
@@ -1541,7 +1731,7 @@ static bool isFuncletReturnInstr(MachineInstr &MI) {
unsigned
X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
- unsigned SPReg;
+ Register SPReg;
int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
/*IgnoreSPUpdates*/ true);
assert(Offset >= 0 && SPReg == TRI->getStackRegister());
@@ -1573,7 +1763,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
// RBP is not included in the callee saved register block. After pushing RBP,
// everything is 16 byte aligned. Everything we allocate before an outgoing
// call must also be 16 byte aligned.
- unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
+ unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
return FrameSizeMinusRBP + XMMSize - CSSize;
@@ -1634,6 +1824,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
uint64_t SEHStackAllocAmt = NumBytes;
+ // AfterPop is the position to insert .cfi_restore.
+ MachineBasicBlock::iterator AfterPop = MBBI;
if (HasFP) {
// Pop EBP.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
@@ -1642,8 +1834,15 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsDwarfCFI) {
unsigned DwarfStackPtr =
TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
- nullptr, DwarfStackPtr, -SlotSize));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize));
+ if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, AfterPop, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfFramePtr));
+ --MBBI;
+ --AfterPop;
+ }
--MBBI;
}
}
@@ -1711,8 +1910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
if (!hasFP(MF) && NeedsDwarfCFI) {
// Define the current CFA rule to use the provided offset.
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
- nullptr, -CSSize - SlotSize));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize));
}
--MBBI;
}
@@ -1738,11 +1937,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Opc == X86::POP32r || Opc == X86::POP64r) {
Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
}
}
}
+ // Emit DWARF info specifying the restores of the callee-saved registers.
+ // If the epilogue block contains the return or has no successors, there is
+ // no need to generate .cfi_restore for the callee-saved registers.
+ if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) {
+ emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);
+ }
+
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -1756,7 +1962,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsFixed = MFI.isFixedObjectIndex(FI);
@@ -1821,7 +2027,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
return Offset + StackSize;
}
} else if (TRI->needsStackRealignment(MF)) {
@@ -1829,7 +2035,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
return Offset + StackSize;
}
// FIXME: Support tail calls
@@ -1849,8 +2055,8 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
-int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
- int FI, unsigned &FrameReg) const {
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
@@ -1860,21 +2066,21 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
return getFrameIndexReference(MF, FI, FrameReg);
FrameReg = TRI->getStackRegister();
- return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+ return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
+ it->second;
}
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
+ int FI, Register &FrameReg,
int Adjustment) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = TRI->getStackRegister();
return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
}
-int
-X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- bool IgnoreSPUpdates) const {
+int X86FrameLowering::getFrameIndexReferencePreferSP(
+ const MachineFunction &MF, int FI, Register &FrameReg,
+ bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
@@ -1985,7 +2191,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (this->TRI->hasBasePointer(MF)) {
// Allocate a spill slot for EBP if we have a base pointer and EH funclets.
if (MF.hasEHFunclets()) {
- int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
+ int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
X86FI->setHasSEHFramePtrSave(true);
X86FI->setSEHFramePtrSaveIndex(FI);
}
@@ -2038,16 +2244,16 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
unsigned Size = TRI->getSpillSize(*RC);
- unsigned Align = TRI->getSpillAlignment(*RC);
+ Align Alignment = TRI->getSpillAlign(*RC);
// ensure alignment
assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");
- SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+ SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
- MFI.ensureMaxAlignment(Align);
+ MFI.ensureMaxAlignment(Alignment);
// Save the start offset and size of XMM in stack frame for funclets.
if (X86::VR128RegClass.contains(Reg)) {
@@ -2061,8 +2267,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
bool X86FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
// Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
@@ -2161,10 +2366,9 @@ void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
CatchRetTarget->setHasAddressTaken();
}
-bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool X86FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -2799,6 +3003,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
+ // Try to avoid emitting dead SP adjustments if the block end is unreachable,
+ // typically because the function is marked noreturn (abort, throw,
+ // assert_fail, etc).
+ if (isDestroy && blockEndIsUnreachable(MBB, I))
+ return I;
+
if (!reserveCallFrame) {
// If the stack pointer can be changed after prologue, turn the
// adjcallstackup instruction into a 'sub ESP, <amt>' and the
@@ -2807,8 +3017,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned StackAlign = getStackAlignment();
- Amount = alignTo(Amount, StackAlign);
+ Amount = alignTo(Amount, getStackAlign());
const Function &F = MF.getFunction();
bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
@@ -2881,13 +3090,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return I;
}
- if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) {
- // If we are performing frame pointer elimination and if the callee pops
- // something off the stack pointer, add it back. We do this until we have
- // more advanced stack pointer tracking ability.
- // We are not tracking the stack pointer adjustment by the callee, so make
- // sure we restore the stack pointer immediately after the call, there may
- // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ if (InternalAmt) {
MachineBasicBlock::iterator CI = I;
MachineBasicBlock::iterator B = MBB.begin();
while (CI != B && !std::prev(CI)->isCall())
@@ -2964,7 +3167,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
.setMIFlag(MachineInstr::FrameSetup);
}
- unsigned UsedReg;
+ Register UsedReg;
int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
int EndOffset = -EHRegOffset - EHRegSize;
FuncInfo.EHRegNodeEndOffset = EndOffset;
@@ -3003,8 +3206,8 @@ int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
return TRI->getSlotSize();
}
-unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
- const {
+Register
+X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
return TRI->getDwarfRegNum(StackPtr, true);
}
@@ -3014,7 +3217,7 @@ struct X86FrameSortingObject {
bool IsValid = false; // true if we care about this Object.
unsigned ObjectIndex = 0; // Index of Object into MFI list.
unsigned ObjectSize = 0; // Size of Object in bytes.
- unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+ Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
unsigned ObjectNumUses = 0; // Object static number of uses.
};
@@ -3099,7 +3302,7 @@ void X86FrameLowering::orderFrameObjects(
for (auto &Obj : ObjectsToAllocate) {
SortingObjects[Obj].IsValid = true;
SortingObjects[Obj].ObjectIndex = Obj;
- SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
+ SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
// Set the size.
int ObjectSize = MFI.getObjectSize(Obj);
if (ObjectSize == 0)
@@ -3192,7 +3395,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
int FrameIndex = H.CatchObj.FrameIndex;
if (FrameIndex != INT_MAX) {
// Ensure alignment.
- unsigned Align = MFI.getObjectAlignment(FrameIndex);
+ unsigned Align = MFI.getObjectAlign(FrameIndex).value();
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
@@ -3219,3 +3422,24 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
UnwindHelpFI)
.addImm(-2);
}
+
+void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS) const {
+ if (STI.is32Bit() && MF.hasEHFunclets())
+ restoreWinEHStackPointersInParent(MF);
+}
+
+void X86FrameLowering::restoreWinEHStackPointersInParent(
+ MachineFunction &MF) const {
+ // 32-bit functions have to restore stack pointers when control is transferred
+ // back to the parent function. These blocks are identified as eh pads that
+ // are not funclet entries.
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()));
+ for (MachineBasicBlock &MBB : MF) {
+ bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
+ if (NeedsRestore)
+ restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
+ /*RestoreSP=*/IsSEH);
+ }
+}
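The block selection here is purely structural: any EH pad that is not itself a funclet entry is a point where control re-enters the parent function, so its stack pointer(s) must be restored. A minimal sketch of that predicate over a made-up block list (the Block type is an illustration, not an LLVM class):

    #include <cstdio>
    #include <vector>

    struct Block {          // hypothetical stand-in for MachineBasicBlock
      const char *Name;
      bool IsEHPad;
      bool IsFuncletEntry;
    };

    int main() {
      std::vector<Block> Blocks = {
          {"entry", false, false},        // ordinary block
          {"funclet.entry", true, true},  // EH pad that starts a funclet
          {"parent.ehpad", true, false},  // EH pad back in the parent function
      };
      for (const Block &MBB : Blocks) {
        bool NeedsRestore = MBB.IsEHPad && !MBB.IsFuncletEntry;
        std::printf("%-14s %s\n", MBB.Name,
                    NeedsRestore ? "restore stack pointers" : "leave as is");
      }
    }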
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
index 2103d6471ead..c0b4be95f88d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FrameLowering.h
@@ -58,9 +58,14 @@ public:
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const override;
+ void
+ emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
+
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL) const;
+ const DebugLoc &DL,
+ bool IsPrologue) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -83,13 +88,14 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
@@ -97,14 +103,14 @@ public:
bool needsFrameIndexResolution(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
- int getWin64EHFrameIndexRef(const MachineFunction &MF,
- int FI, unsigned &SPReg) const;
- int getFrameIndexReferenceSP(const MachineFunction &MF,
- int FI, unsigned &SPReg, int Adjustment) const;
+ int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &SPReg) const;
+ int getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &SPReg, int Adjustment) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg,
+ Register &FrameReg,
bool IgnoreSPUpdates) const override;
MachineBasicBlock::iterator
@@ -116,6 +122,10 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
/// Check the instruction before/after the passed instruction. If
/// it is an ADD/SUB/LEA instruction, it is deleted and the
/// stack adjustment is returned as a positive value for ADD/LEA and
@@ -169,12 +179,14 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ void restoreWinEHStackPointersInParent(MachineFunction &MF) const;
+
int getInitialCFAOffset(const MachineFunction &MF) const override;
- unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+ Register getInitialCFARegister(const MachineFunction &MF) const override;
/// Return true if the function has a redzone (accessible bytes past the
- /// frame of the top of stack function) as part of it's ABI.
+ /// frame of the top of stack function) as part of its ABI.
bool has128ByteRedZone(const MachineFunction& MF) const;
private:
@@ -189,11 +201,33 @@ private:
void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool InProlog) const;
+ void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const;
+ void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+
+ void emitStackProbeInlineGenericBlock(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ uint64_t Offset) const;
+
+ void emitStackProbeInlineGenericLoop(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ uint64_t Offset) const;
/// Emit a stub to later inline the target stack probe.
- void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool InProlog) const;
+ MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 88af0ebcfd0e..3cd80cb04ab8 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -17,8 +17,6 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
@@ -31,9 +29,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;
@@ -45,6 +40,10 @@ static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
cl::desc("Enable setting constant bits to reduce size of mask immediates"),
cl::Hidden);
+static cl::opt<bool> EnablePromoteAnyextLoad(
+ "x86-promote-anyext-load", cl::init(true),
+ cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
+
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
@@ -72,14 +71,14 @@ namespace {
const char *ES;
MCSymbol *MCSym;
int JT;
- unsigned Align; // CP alignment.
+ Align Alignment; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
- MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
+ MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {}
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
@@ -145,7 +144,7 @@ namespace {
dbgs() << MCSym;
else
dbgs() << "nul";
- dbgs() << " JT" << JT << " Align" << Align << '\n';
+ dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
}
#endif
};
@@ -161,10 +160,6 @@ namespace {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// If true, selector should try to optimize for code size instead of
- /// performance.
- bool OptForSize;
-
/// If true, selector should try to optimize for minimum code size.
bool OptForMinSize;
@@ -173,7 +168,7 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
OptForMinSize(false), IndirectTlsSegRefs(false) {}
StringRef getPassName() const override {
@@ -187,16 +182,15 @@ namespace {
"indirect-tls-seg-refs");
// OptFor[Min]Size are used in pattern predicates that isel is matching.
- OptForSize = MF.getFunction().hasOptSize();
OptForMinSize = MF.getFunction().hasMinSize();
- assert((!OptForMinSize || OptForSize) &&
+ assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
"OptForMinSize implies OptForSize");
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
- void EmitFunctionEntryCode() override;
+ void emitFunctionEntryCode() override;
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
@@ -221,9 +215,9 @@ namespace {
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index, SDValue &Disp,
- SDValue &Segment);
+ bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
+ SDValue ScaleOp, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -234,11 +228,6 @@ namespace {
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
- SDValue &Base, SDValue &Scale,
- SDValue &Index, SDValue &Disp,
- SDValue &Segment,
- SDValue &NodeWithChain);
bool selectRelocImm(SDValue N, SDValue &Op);
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
@@ -259,6 +248,8 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool isProfitableToFormMaskedOp(SDNode *N) const;
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -300,8 +291,8 @@ namespace {
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
- Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
- AM.Align, AM.Disp, AM.SymbolFlags);
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
+ AM.Disp, AM.SymbolFlags);
else if (AM.ES) {
assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
@@ -368,9 +359,10 @@ namespace {
if (User->getNumOperands() != 2)
continue;
- // If this can match to INC/DEC, don't count it as a use.
- if (User->getOpcode() == ISD::ADD &&
- (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
+ // If this is a sign-extended 8-bit integer immediate used in an ALU
+ // instruction, there is probably an opcode encoding to save space.
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ if (C && isInt<8>(C->getSExtValue()))
continue;
// Immediates that are used for offsets as part of stack
@@ -475,14 +467,6 @@ namespace {
bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
- /// Returns whether this is a relocatable immediate in the range
- /// [-2^Width .. 2^Width-1].
- template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
- if (auto *CN = dyn_cast<ConstantSDNode>(N))
- return isInt<Width>(CN->getSExtValue());
- return isSExtAbsoluteSymbolRef(Width, N);
- }
-
// Indicates we should prefer to use a non-temporal load for this load.
bool useNonTemporalLoad(LoadSDNode *N) const {
if (!N->isNonTemporal())
@@ -513,8 +497,8 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
- bool combineIncDecVector(SDNode *Node);
bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTERNLOG(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -581,12 +565,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (!N.hasOneUse())
return false;
- // FIXME: Temporary hack to prevent strict floating point nodes from
- // folding into masked operations illegally.
- if (U == Root && Root->getOpcode() == ISD::VSELECT &&
- N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD)
- return false;
-
if (N.getOpcode() != ISD::LOAD)
return true;
@@ -650,6 +628,11 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
(-Imm->getAPIntValue()).isSignedIntN(8))
return false;
+
+ if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8) &&
+ hasNoCarryFlagUses(SDValue(U, 1)))
+ return false;
}
// If the other operand is a TLS address, we should fold it instead.
@@ -724,6 +707,20 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
return true;
}
+// Indicates it is profitable to form an AVX512 masked operation. Returning
+// false will favor a masked register-to-register move or vblendm, and the
+// operation will be selected separately.
+bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
+ assert(
+ (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
+ "Unexpected opcode!");
+
+ // If the operation has additional users, the operation will be duplicated.
+ // Check the use count to prevent that.
+ // FIXME: Are there cheap opcodes we might want to duplicate?
+ return N->getOperand(1).hasOneUse();
+}
+
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
@@ -799,6 +796,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
+ bool MadeChange = false;
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
@@ -811,11 +809,111 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
+ /// Convert vector increment or decrement to sub/add with an all-ones
+ /// constant:
+ /// add X, <1, 1...> --> sub X, <-1, -1...>
+ /// sub X, <1, 1...> --> add X, <-1, -1...>
+ /// The all-ones vector constant can be materialized using a pcmpeq
+ /// instruction that is commonly recognized as an idiom (has no register
+ /// dependency), so that's better/smaller than loading a splat 1 constant.
+ if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ N->getSimpleValueType(0).isVector()) {
+
+ APInt SplatVal;
+ if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
+ SplatVal.isOneValue()) {
+ SDLoc DL(N);
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ SDValue AllOnes =
+ CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
+ AllOnes = CurDAG->getBitcast(VT, AllOnes);
+
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ SDValue Res =
+ CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+
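The rewrite leans on a simple two's-complement identity: adding a splat of 1 equals subtracting a splat of all-ones (and vice versa), and the all-ones constant is cheap because a pcmpeq idiom can materialize it without a load. A quick scalar check of that identity, nothing X86-specific:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t AllOnes = 0xffffffffu; // what a pcmpeq idiom materializes
      for (uint32_t X : {0u, 7u, 0x7fffffffu, 0xffffffffu}) {
        // add X, 1  ==  sub X, -1   and   sub X, 1  ==  add X, -1 (mod 2^32).
        assert(X + 1u == X - AllOnes);
        assert(X - 1u == X + AllOnes);
      }
      return 0;
    }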
switch (N->getOpcode()) {
+ case X86ISD::VBROADCAST: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ SDLoc dl(N);
+ SDValue NarrowBCast =
+ CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ auto *MemNode = cast<MemSDNode>(N);
+ SDLoc dl(N);
+ SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
+ SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
+ SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
+ MemNode->getMemOperand());
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ SDValue To[] = {Res, NarrowBCast.getValue(1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
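Without AVX512BW the 512-bit v32i16/v64i8 broadcast is emulated by broadcasting into a 256-bit vector and inserting that half twice, at element 0 and at element 16 (32 for bytes). A standalone sketch of the same construction with plain arrays, mirroring the v32i16 case:

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint16_t Scalar = 0xBEEF;

      // 256-bit broadcast: 16 x i16, every lane equal to the scalar.
      std::array<uint16_t, 16> Narrow;
      Narrow.fill(Scalar);

      // 512-bit result built from two copies of the narrow broadcast,
      // inserted at element 0 and element 16.
      std::array<uint16_t, 32> Wide{};
      std::copy(Narrow.begin(), Narrow.end(), Wide.begin());      // index 0
      std::copy(Narrow.begin(), Narrow.end(), Wide.begin() + 16); // index 16

      for (uint16_t V : Wide)
        assert(V == Scalar);
      return 0;
    }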
+ case ISD::VSELECT: {
+ // Replace VSELECT with non-mask conditions with BLENDV.
+ if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv =
+ CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1), N->getOperand(2));
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Blendv.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
@@ -849,7 +947,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::SHL:
@@ -872,27 +970,33 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG: {
// Replace vector any extend with the zero extend equivalents so we don't
// need 2 sets of patterns. Ignore vXi1 extensions.
- if (!N->getValueType(0).isVector() ||
- N->getOperand(0).getScalarValueSizeInBits() == 1)
+ if (!N->getValueType(0).isVector())
break;
- unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
- ? ISD::ZERO_EXTEND
- : ISD::ZERO_EXTEND_VECTOR_INREG;
+ unsigned NewOpc;
+ if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
+ assert(N->getOpcode() == ISD::ANY_EXTEND &&
+ "Unexpected opcode for mask vector!");
+ NewOpc = ISD::SIGN_EXTEND;
+ } else {
+ NewOpc = N->getOpcode() == ISD::ANY_EXTEND
+ ? ISD::ZERO_EXTEND
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+ }
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::FCEIL:
@@ -936,7 +1040,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case X86ISD::FANDN:
@@ -979,7 +1083,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
}
@@ -1018,6 +1122,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
+ MadeChange = true;
continue;
}
@@ -1064,14 +1169,17 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
- SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
- MemTmp, MachinePointerInfo(), MemVT);
- SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(), MemVT);
+ SDValue Store = CurDAG->getTruncStore(
+ CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
+ MemTmp, MPI, MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havok on the dag because
@@ -1117,6 +1225,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
@@ -1127,7 +1238,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDVTList VTs = CurDAG->getVTList(MVT::Other);
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
- MachinePointerInfo(), 0,
+ MPI, /*Align*/ None,
MachineMemOperand::MOStore);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Store->getFlags();
@@ -1137,15 +1248,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
} else {
assert(SrcVT == MemVT && "Unexpected VT!");
Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
- MachinePointerInfo());
+ MPI);
}
if (!DstIsSSE) {
SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
SDValue Ops[] = {Store, MemTmp};
- Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT,
- MachinePointerInfo(), 0,
- MachineMemOperand::MOLoad);
+ Result = CurDAG->getMemIntrinsicNode(
+ X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Result->getFlags();
Flags.setNoFPExcept(true);
@@ -1153,8 +1264,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
} else {
assert(DstVT == MemVT && "Unexpected VT!");
- Result =
- CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo());
+ Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
}
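The surrounding hunks keep the existing strategy of bouncing a value between the x87 and SSE register files through a stack temporary at the memory precision; the change only attaches a concrete fixed-stack MachinePointerInfo to that store/load pair. A rough standalone illustration of the round trip, assuming an f80 source and an f64 slot, not LLVM code:

#include <cstdio>

// The value is stored at the memory precision and reloaded, which is what the
// truncstore/extload (or FST/FLD) pair on the stack temporary models.
int main() {
  long double X87Val = 1.0L / 3.0L;  // value living in the x87 domain (f80)
  double Slot = (double)X87Val;      // store to the stack temporary (MemVT = f64)
  double SSEVal = Slot;              // reload into the SSE domain
  printf("%.17g\n", SSEVal);
  return 0;
}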
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
@@ -1171,13 +1281,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
}
- // The load+call transform above can leave some dead nodes in the graph. Make
- // sure we remove them. Its possible some of the other transforms do to so
- // just remove dead nodes unconditionally.
- CurDAG->RemoveDeadNodes();
+ // Remove any dead nodes that may have been left behind.
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
@@ -1275,6 +1384,8 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
And.getOperand(6) /* Chain */ };
MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32, MVT::Other, Ops);
+ CurDAG->setNodeMemRefs(
+ Test, cast<MachineSDNode>(And.getNode())->memoperands());
ReplaceUses(N, Test);
MadeChange = true;
continue;
@@ -1390,7 +1501,7 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() {
}
}
-void X86DAGToDAGISel::EmitFunctionEntryCode() {
+void X86DAGToDAGISel::emitFunctionEntryCode() {
// If this is main, emit special code for main.
const Function &F = MF->getFunction();
if (F.hasExternalLinkage() && F.getName() == "main")
@@ -1409,18 +1520,20 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
- // If there's no offset to fold, we don't need to do any work.
- if (Offset == 0)
- return false;
+ // We may have already matched a displacement and the caller just added the
+ // symbolic displacement. So we still need to do the checks even if Offset
+ // is zero.
+
+ int64_t Val = AM.Disp + Offset;
// Cannot combine ExternalSymbol displacements with integer offsets.
- if (AM.ES || AM.MCSym)
+ if (Val != 0 && (AM.ES || AM.MCSym))
return true;
- int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
- if (!X86::isOffsetSuitableForCodeModel(Val, M,
+ if (Val != 0 &&
+ !X86::isOffsetSuitableForCodeModel(Val, M,
AM.hasSymbolicDisplacement()))
return true;
// In addition to the checks required for a register base, check that
@@ -1449,13 +1562,13 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetFuchsia()))
switch (N->getPointerInfo().getAddrSpace()) {
- case 256:
+ case X86AS::GS:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
return false;
- case 257:
+ case X86AS::FS:
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
return false;
- // Address space 258 is not handled here, because it is not used to
+ // Address space X86AS::SS is not handled here, because it is not used to
// address TLS areas.
}
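The symbolic names used here stand for LLVM's x86 segment address-space numbers, which the old code wrote as raw integers. A small sketch of that mapping; the values are taken from the 256/257/258 literals replaced in this patch, while the enum shape is an assumption:

#include <cstdio>

// Segment-relative address spaces: GS = 256, FS = 257, SS = 258.
namespace X86AS { enum : unsigned { GS = 256, FS = 257, SS = 258 }; }

int main() {
  printf("GS=%u FS=%u SS=%u\n", (unsigned)X86AS::GS, (unsigned)X86AS::FS,
         (unsigned)X86AS::SS);
  return 0;
}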
@@ -1505,7 +1618,7 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
Offset = G->getOffset();
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
+ AM.Alignment = CP->getAlign();
AM.SymbolFlags = CP->getTargetFlags();
Offset = CP->getOffset();
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
@@ -1583,9 +1696,10 @@ bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
return false;
AM = Backup;
- // Try again after commuting the operands.
- if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
- !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+  // Try again after commuting the operands.
+ if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
+ Depth + 1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
return false;
AM = Backup;
@@ -1782,7 +1896,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
// We also need to ensure that mask is a continuous run of bits.
if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
@@ -1877,7 +1991,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
@@ -2280,15 +2394,16 @@ bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
return matchAddressBase(N, AM);
}
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
+bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
+ SDValue IndexOp, SDValue ScaleOp,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
X86ISelAddressMode AM;
- auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
- AM.IndexReg = Mgs->getIndex();
- AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
+ AM.IndexReg = IndexOp;
+ AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
- unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+ unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == X86AS::FS)
@@ -2296,11 +2411,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- SDLoc DL(N);
- MVT VT = N.getSimpleValueType();
+ SDLoc DL(BasePtr);
+ MVT VT = BasePtr.getSimpleValueType();
// Try to match into the base and displacement fields.
- if (matchVectorAddress(N, AM))
+ if (matchVectorAddress(BasePtr, AM))
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
@@ -2331,12 +2446,11 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
- // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
- if (AddrSpace == 256)
+ if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
- if (AddrSpace == 257)
+ if (AddrSpace == X86AS::FS)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
- if (AddrSpace == 258)
+ if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
@@ -2351,86 +2465,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
-// We can only fold a load if all nodes between it and the root node have a
-// single use. If there are additional uses, we could end up duplicating the
-// load.
-static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
- while (User != Root) {
- if (!User->hasOneUse())
- return false;
- User = *User->use_begin();
- }
-
- return true;
-}
-
-/// Match a scalar SSE load. In particular, we want to match a load whose top
-/// elements are either undef or zeros. The load flavor is derived from the
-/// type of N, which is either v4f32 or v2f64.
-///
-/// We also return:
-/// PatternChainNode: this is the matched node that has a chain input and
-/// output.
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
- SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment,
- SDValue &PatternNodeWithChain) {
- if (!hasSingleUsesFromRoot(Root, Parent))
- return false;
-
- // We can allow a full vector load here since narrowing a load is ok unless
- // it's volatile or atomic.
- if (ISD::isNON_EXTLoad(N.getNode())) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- if (LD->isSimple() &&
- IsProfitableToFold(N, LD, Root) &&
- IsLegalToFold(N, Parent, Root, OptLevel)) {
- PatternNodeWithChain = N;
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- // We can also match the special zero extended load opcode.
- if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
- PatternNodeWithChain = N;
- if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
- auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
- return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
- // once. Otherwise the load might get duplicated and the chain output of the
- // duplicate load will not be observed by all dependencies.
- if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
- PatternNodeWithChain = N.getOperand(0);
- if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
- IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- return false;
-}
-
-
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
- if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
- uint64_t ImmVal = CN->getZExtValue();
- if (!isUInt<32>(ImmVal))
- return false;
-
- Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
- return true;
- }
-
// In static codegen with small code model, we can get the address of a label
// into a register with 'movl'
if (N->getOpcode() != X86ISD::Wrapper)
@@ -2604,12 +2639,6 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
- if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
- Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
- N.getValueType());
- return true;
- }
-
// Keep track of the original value type and whether this value was
// truncated. If we see a truncation from pointer type to VT that truncates
// bits that are known to be zero, we can use a narrow reference.
@@ -3896,49 +3925,82 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
-/// Convert vector increment or decrement to sub/add with an all-ones constant:
-/// add X, <1, 1...> --> sub X, <-1, -1...>
-/// sub X, <1, 1...> --> add X, <-1, -1...>
-/// The all-ones vector constant can be materialized using a pcmpeq instruction
-/// that is commonly recognized as an idiom (has no register dependency), so
-/// that's better/smaller than loading a splat 1 constant.
-bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
- assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
- "Unexpected opcode for increment/decrement transform");
-
- EVT VT = Node->getValueType(0);
- assert(VT.isVector() && "Should only be called for vectors.");
-
- SDValue X = Node->getOperand(0);
- SDValue OneVec = Node->getOperand(1);
+// Try to match two logic ops to a VPTERNLOG.
+// FIXME: Handle inverted inputs?
+// FIXME: Handle more complex patterns that use an operand more than once?
+bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
- APInt SplatVal;
- if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512() ||
+ NVT.getVectorElementType() == MVT::i1)
return false;
- SDLoc DL(Node);
- SDValue OneConstant, AllOnesVec;
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
- APInt Ones = APInt::getAllOnesValue(32);
- assert(VT.getSizeInBits() % 32 == 0 &&
- "Expected bit count to be a multiple of 32");
- OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
- insertDAGNode(*CurDAG, X, OneConstant);
+ unsigned Opc1 = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
- unsigned NumElts = VT.getSizeInBits() / 32;
- assert(NumElts > 0 && "Expected to get non-empty vector.");
- AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
- DL, OneConstant);
- insertDAGNode(*CurDAG, X, AllOnesVec);
+ auto isLogicOp = [](unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP;
+ };
- AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
- insertDAGNode(*CurDAG, X, AllOnesVec);
+ SDValue A, B, C;
+ unsigned Opc2;
+ if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
+ Opc2 = N1.getOpcode();
+ A = N0;
+ B = N1.getOperand(0);
+ C = N1.getOperand(1);
+ } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
+ Opc2 = N0.getOpcode();
+ A = N1;
+ B = N0.getOperand(0);
+ C = N0.getOperand(1);
+ } else
+ return false;
- unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
- SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
+ uint64_t Imm;
+ switch (Opc1) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x80; break;
+ case ISD::OR: Imm = 0xe0; break;
+ case ISD::XOR: Imm = 0x60; break;
+ case X86ISD::ANDNP: Imm = 0x20; break;
+ }
+ break;
+ case ISD::OR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0xf8; break;
+ case ISD::OR: Imm = 0xfe; break;
+ case ISD::XOR: Imm = 0xf6; break;
+ case X86ISD::ANDNP: Imm = 0xf2; break;
+ }
+ break;
+ case ISD::XOR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x78; break;
+ case ISD::OR: Imm = 0x1e; break;
+ case ISD::XOR: Imm = 0x96; break;
+ case X86ISD::ANDNP: Imm = 0xd2; break;
+ }
+ break;
+ }
- ReplaceNode(Node, NewNode.getNode());
- SelectCode(NewNode.getNode());
+ SDLoc DL(N);
+ SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
+ CurDAG->getTargetConstant(Imm, DL, MVT::i8));
+ ReplaceNode(N, New.getNode());
+ SelectCode(New.getNode());
return true;
}
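For the immediate table above: bit i of the VPTERNLOG imm8 is the value of the combined boolean function for the input triple (A, B, C) given by bits 2, 1, 0 of i, and X86ISD::ANDNP(B, C) computes ~B & C. A standalone sketch that reproduces the AND row of the table; hypothetical helper, not LLVM code:

#include <cstdint>
#include <cstdio>

// Build a VPTERNLOG immediate from a ternary boolean function f(A, B, C).
template <typename F> static uint8_t ternlogImm(F f) {
  uint8_t Imm = 0;
  for (int i = 0; i < 8; ++i)
    if (f((i >> 2) & 1, (i >> 1) & 1, i & 1))   // A = bit 2, B = bit 1, C = bit 0
      Imm |= 1 << i;
  return Imm;
}

int main() {
  printf("A&(B&C)  = 0x%02x\n", (unsigned)ternlogImm([](int a, int b, int c) { return a & b & c; }));      // 0x80
  printf("A&(B|C)  = 0x%02x\n", (unsigned)ternlogImm([](int a, int b, int c) { return a & (b | c); }));    // 0xe0
  printf("A&(B^C)  = 0x%02x\n", (unsigned)ternlogImm([](int a, int b, int c) { return a & (b ^ c); }));    // 0x60
  printf("A&(~B&C) = 0x%02x\n", (unsigned)ternlogImm([](int a, int b, int c) { return a & (~b & c); }));   // 0x20
  return 0;
}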
@@ -4014,159 +4076,50 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
bool FoldedBCast, bool Masked) {
- if (Masked) {
- if (FoldedLoad) {
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
- }
- }
-
- if (FoldedBCast) {
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
- }
- }
-
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
- }
- }
+#define VPTESTM_CASE(VT, SUFFIX) \
+case MVT::VT: \
+ if (Masked) \
+ return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
+ return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
+
+
+#define VPTESTM_BROADCAST_CASES(SUFFIX) \
+default: llvm_unreachable("Unexpected VT!"); \
+VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
+VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
+VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
+VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
+VPTESTM_CASE(v16i32, DZ##SUFFIX) \
+VPTESTM_CASE(v8i64, QZ##SUFFIX)
+
+#define VPTESTM_FULL_CASES(SUFFIX) \
+VPTESTM_BROADCAST_CASES(SUFFIX) \
+VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
+VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
+VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
+VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
+VPTESTM_CASE(v64i8, BZ##SUFFIX) \
+VPTESTM_CASE(v32i16, WZ##SUFFIX)
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+ VPTESTM_FULL_CASES(rm)
}
}
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+ VPTESTM_BROADCAST_CASES(rmb)
}
}
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+ VPTESTM_FULL_CASES(rr)
}
+
+#undef VPTESTM_FULL_CASES
+#undef VPTESTM_BROADCAST_CASES
+#undef VPTESTM_CASE
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
@@ -4477,8 +4430,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case Intrinsic::x86_tileloadd64:
+ case Intrinsic::x86_tileloaddt164:
+ case Intrinsic::x86_tilestored64: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
+ case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+ case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ if (Opc == X86::PTILESTORED) {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ } else {
+ SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ }
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
-
break;
}
case ISD::BRIND: {
@@ -4490,9 +4474,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Converts a 32-bit register to a 64-bit, zero-extended version of
// it. This is needed because x86-64 can do many things, but jmp %r32
// ain't one of them.
- const SDValue &Target = Node->getOperand(1);
- assert(Target.getSimpleValueType() == llvm::MVT::i32);
- SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+ SDValue Target = Node->getOperand(1);
+ assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
ReplaceNode(Node, Brind.getNode());
@@ -4516,21 +4500,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
- case ISD::VSELECT: {
- // Replace VSELECT with non-mask conditions with with BLENDV.
- if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
- break;
-
- assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- SDValue Blendv = CurDAG->getNode(
- X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2));
- ReplaceNode(Node, Blendv.getNode());
- SelectCode(Blendv.getNode());
- // We already called ReplaceUses.
- return;
- }
-
case ISD::SRL:
if (matchBitExtract(Node))
return;
@@ -4569,24 +4538,21 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::XOR:
if (tryShrinkShlLogicImm(Node))
return;
-
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
return;
+ if (tryVPTERNLOG(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
- if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
- combineIncDecVector(Node))
- return;
-
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
// the patterns on the add/sub/and/or/xor with immediate patterns in the
// tablegen files to check immediate use count without making the patterns
// unavailable to the fast-isel table.
- if (!OptForSize)
+ if (!CurDAG->shouldOptForSize())
break;
// Only handle i8/i16/i32/i64.
@@ -4720,7 +4686,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
- // Multiply is commmutative.
+ // Multiply is commutative.
if (!FoldedLoad) {
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedLoad)
@@ -4772,31 +4738,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
- bool isSigned = Opcode == ISD::SMUL_LOHI;
- if (!isSigned) {
- switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
- case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
- case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
- }
- } else {
- switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
- case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
- case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
- }
- }
-
- unsigned SrcReg, LoReg, HiReg;
- switch (Opc) {
- default: llvm_unreachable("Unknown MUL opcode!");
- case X86::IMUL32r:
- case X86::MUL32r:
- SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ unsigned LoReg, HiReg;
+ bool IsSigned = Opcode == ISD::SMUL_LOHI;
+ bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+ bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ Opc = UseMULXHi ? X86::MULX32Hrr :
+ UseMULX ? X86::MULX32rr :
+ IsSigned ? X86::IMUL32r : X86::MUL32r;
+ MOpc = UseMULXHi ? X86::MULX32Hrm :
+ UseMULX ? X86::MULX32rm :
+ IsSigned ? X86::IMUL32m : X86::MUL32m;
+ LoReg = UseMULX ? X86::EDX : X86::EAX;
+ HiReg = X86::EDX;
break;
- case X86::IMUL64r:
- case X86::MUL64r:
- SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ case MVT::i64:
+ Opc = UseMULXHi ? X86::MULX64Hrr :
+ UseMULX ? X86::MULX64rr :
+ IsSigned ? X86::IMUL64r : X86::MUL64r;
+ MOpc = UseMULXHi ? X86::MULX64Hrm :
+ UseMULX ? X86::MULX64rm :
+ IsSigned ? X86::IMUL64m : X86::MUL64m;
+ LoReg = UseMULX ? X86::RDX : X86::RAX;
+ HiReg = X86::RDX;
break;
}
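Context for the register choices above: BMI2 MULX reads its implicit source from EDX/RDX and writes the high and low halves of the product to two explicit destinations without touching EFLAGS, so LoReg doubles as the implicit-input register and the Hi-only forms can be selected when the low half has no uses. A minimal sketch of the value UMUL_LOHI computes; hypothetical helper, not LLVM code:

#include <cstdint>
#include <cstdio>

// umul_lohi32: the widening multiply that MUL32r/MULX32rr implement.
static void umul_lohi32(uint32_t A, uint32_t B, uint32_t &Lo, uint32_t &Hi) {
  uint64_t P = (uint64_t)A * B;
  Lo = (uint32_t)P;
  Hi = (uint32_t)(P >> 32);
}

int main() {
  uint32_t Lo, Hi;
  umul_lohi32(0x89abcdefu, 0x12345678u, Lo, Hi);
  printf("hi=0x%08x lo=0x%08x\n", (unsigned)Hi, (unsigned)Lo);
  return 0;
}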
@@ -4809,17 +4775,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
if (foldedLoad) {
SDValue Chain;
MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
- Chain = SDValue(CNode, 0);
- InFlag = SDValue(CNode, 1);
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ Chain = SDValue(CNode, 1);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
@@ -4827,27 +4807,42 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
SDValue Ops[] = { N1, InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
- InFlag = SDValue(CNode, 0);
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
- assert(LoReg && "Register for low half is not defined!");
- SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
- NVT, InFlag);
- InFlag = ResLo.getValue(2);
+ if (!ResLo) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+ NVT, InFlag);
+ InFlag = ResLo.getValue(2);
+ }
ReplaceUses(SDValue(Node, 0), ResLo);
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
- assert(HiReg && "Register for high half is not defined!");
- SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
- NVT, InFlag);
- InFlag = ResHi.getValue(2);
+ if (!ResHi) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+ NVT, InFlag);
+ InFlag = ResHi.getValue(2);
+ }
ReplaceUses(SDValue(Node, 1), ResHi);
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
dbgs() << '\n');
@@ -4862,23 +4857,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- unsigned Opc, MOpc;
+ unsigned ROpc, MOpc;
bool isSigned = Opcode == ISD::SDIVREM;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
- case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
- case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
- case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
- case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
- case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
- case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
}
}
@@ -4943,7 +4938,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
- SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue ClrNode =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
switch (NVT.SimpleTy) {
case MVT::i16:
ClrNode =
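The MOV32r0 above zeroes the high half of the dividend because the hardware divide takes a double-width input, for example EDX:EAX for the 32-bit forms; the signed path sign-extends via SExtOpcode instead. A small standalone sketch of that semantics for the unsigned 32-bit case, illustrative only:

#include <cstdint>
#include <cstdio>

// DIV r32 divides EDX:EAX by its operand, so a plain 32-bit udiv/urem needs
// the high half (EDX) cleared first, which is what MOV32r0 provides.
int main() {
  uint32_t Lo = 100, Hi = 0, D = 7;
  uint64_t Dividend = ((uint64_t)Hi << 32) | Lo;
  printf("q=%llu r=%llu\n", (unsigned long long)(Dividend / D),
         (unsigned long long)(Dividend % D));
  return 0;
}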
@@ -4985,7 +4982,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
InFlag =
- SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+ SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
}
// Prevent use of AH in a REX instruction by explicitly copying it to
@@ -5034,6 +5031,77 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
}
+ case X86ISD::FCMP:
+ case X86ISD::STRICT_FCMP:
+ case X86ISD::STRICT_FCMPS: {
+ bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
+ Node->getOpcode() == X86ISD::STRICT_FCMPS;
+ SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
+ SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
+
+ // Save the original VT of the compare.
+ MVT CmpVT = N0.getSimpleValueType();
+
+ // Floating point needs special handling if we don't have FCOMI.
+ if (Subtarget->hasCMov())
+ break;
+
+ bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
+
+ unsigned Opc;
+ switch (CmpVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected type!");
+ case MVT::f32:
+ Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
+ break;
+ case MVT::f64:
+ Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
+ break;
+ case MVT::f80:
+ Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
+ break;
+ }
+
+ SDValue Cmp;
+ SDValue Chain =
+ IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
+ if (IsStrictCmp) {
+ SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
+ }
+
+ // Move FPSW to AX.
+ SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
+ Chain = FPSW;
+ SDValue FNSTSW =
+ SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
+ FPSW.getValue(1)),
+ 0);
+
+ // Extract upper 8-bits of AX.
+ SDValue Extract =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
+
+ // Move AH into flags.
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() &&
+ "Target doesn't support SAHF or FCOMI?");
+ SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
+ Chain = AH;
+ SDValue SAHF = SDValue(
+ CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
+
+ if (IsStrictCmp)
+ ReplaceUses(SDValue(Node, 1), Chain);
+
+ ReplaceUses(SDValue(Node, 0), SAHF);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
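The sequence built above is the classic pre-FCOMI compare: the x87 compare sets C0/C2/C3 in FPSW, FNSTSW copies FPSW into AX, and SAHF loads AH into EFLAGS so that C0, C2 and C3 land in CF, PF and ZF. A tiny standalone check of that bit mapping, assuming the usual FPSW layout with C0 at bit 8, C2 at bit 10 and C3 at bit 14:

#include <cstdint>
#include <cstdio>

// AH = FPSW >> 8, and SAHF loads AH into the low byte of EFLAGS, so
// C0 -> CF (bit 0), C2 -> PF (bit 2), C3 -> ZF (bit 6).
int main() {
  uint16_t Fpsw = (1u << 8) | (1u << 14);   // pretend C0 and C3 are set
  uint8_t Ah = Fpsw >> 8;                   // what FNSTSW AX leaves in AH
  printf("CF=%d PF=%d ZF=%d\n", Ah & 1, (Ah >> 2) & 1, (Ah >> 6) & 1);
  return 0;
}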
+
case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -5267,6 +5335,279 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
+
+ case X86ISD::SETCC_CARRY: {
+ // We have to do this manually because tblgen will put the eflags copy in
+ // the wrong place if we use an extract_subreg in the pattern.
+ MVT VT = Node->getSimpleValueType(0);
+
+    // Copy flags to the EFLAGS register and glue it to the next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(1), SDValue());
+
+    // Create a 64-bit instruction if the result is 64 bits; otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+ MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ SDValue Result = SDValue(
+ CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+
+ ReplaceUses(SDValue(Node, 0), Result);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
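For reference, SETB_C32r/SETB_C64r are pseudos for the "sbb reg, reg" idiom: the result is 0 - 0 - CF, which broadcasts the carry flag into every bit. A one-line sketch of that identity, illustrative only:

#include <cstdint>
#include <cstdio>

// setcc_carry(CF) == 0 - 0 - CF: all zeros when CF is clear, all ones when set.
static uint32_t setcc_carry(bool CF) { return 0u - 0u - (CF ? 1u : 0u); }

int main() {
  printf("%#010x %#010x\n", (unsigned)setcc_carry(false),
         (unsigned)setcc_carry(true));
  return 0;
}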
+ case X86ISD::SBB: {
+ if (isNullConstant(Node->getOperand(0)) &&
+ isNullConstant(Node->getOperand(1))) {
+ MVT VT = Node->getSimpleValueType(0);
+
+ // Create zero.
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue Zero =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
+ if (VT == MVT::i64) {
+ Zero = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
+ 0);
+ }
+
+      // Copy flags to the EFLAGS register and glue it to the next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(2), SDValue());
+
+      // Create a 64-bit instruction if the result is 64 bits; otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
+ MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ VTs = CurDAG->getVTList(SBBVT, MVT::i32);
+ SDValue Result =
+ SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
+ EFLAGS.getValue(1)}),
+ 0);
+
+ // Replace the flag use.
+ ReplaceUses(SDValue(Node, 1), Result.getValue(1));
+
+ // Replace the result use.
+ if (!SDValue(Node, 0).use_empty()) {
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+ ReplaceUses(SDValue(Node, 0), Result);
+ }
+
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ break;
+ }
+ case X86ISD::MGATHER: {
+ auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
+ SDValue IndexOp = Mgt->getIndex();
+ SDValue Mask = Mgt->getMask();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Node->getSimpleValueType(0);
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+    // otherwise only doing loose type checking in here based on what a type
+    // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector() || !MaskVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc = 0;
+ bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
+ if (AVX512Gather) {
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
+ } else {
+ assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
+ "Unexpected mask VT!");
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
+ }
+
+ if (!Opc)
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue PassThru = Mgt->getPassThru();
+ SDValue Chain = Mgt->getChain();
+ // Gather instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
+
+ MachineSDNode *NewNode;
+ if (AVX512Gather) {
+ SDValue Ops[] = {PassThru, Mask, Base, Scale,
+ Index, Disp, Segment, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ } else {
+ SDValue Ops[] = {PassThru, Base, Scale, Index,
+ Disp, Segment, Mask, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ }
+ CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::MSCATTER: {
+ auto *Sc = cast<X86MaskedScatterSDNode>(Node);
+ SDValue Value = Sc->getValue();
+ SDValue IndexOp = Sc->getIndex();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Value.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+    // otherwise only doing loose type checking in here based on what a type
+    // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc;
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
+ else
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue Mask = Sc->getMask();
+ SDValue Chain = Sc->getChain();
+ // Scatter instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
+
+ MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case ISD::PREALLOCATED_SETUP: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case ISD::PREALLOCATED_ARG: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ SDValue ArgIndex = Node->getOperand(2);
+ SDValue Ops[3];
+ Ops[0] = CallIdValue;
+ Ops[1] = ArgIndex;
+ Ops[2] = Chain;
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_ARG, dl,
+ CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
+ MVT::Other),
+ Ops);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
+ ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
}
SelectCode(Node);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
index c8720d9ae3a6..450927aaf5cc 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,8 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
@@ -28,6 +29,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -75,13 +76,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
-// Added in 10.0.
-static cl::opt<bool> EnableOldKNLABI(
- "x86-enable-old-knl-abi", cl::init(false),
- cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
- "one ZMM register on AVX512F, but not AVX512BW targets."),
- cl::Hidden);
-
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -164,7 +158,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// If we don't have cmpxchg8b(meaing this is a 386/486), limit atomic size to
// 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
- // FIXME: Should we be limitting the atomic size on other configs? Default is
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
@@ -190,12 +184,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
+ setCondCodeAction(ISD::SETOEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
// Integer absolute.
if (Subtarget.hasCMov()) {
@@ -206,10 +198,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+ // For slow shld targets we only lower for code size.
+ LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
- setOperationAction(ShiftOp , MVT::i32 , Custom);
+ setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
- setOperationAction(ShiftOp , MVT::i64 , Custom);
+ setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
}
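For context, ISD::FSHL/FSHR are the funnel shifts that SHLD/SHRD implement, which is why their legality is keyed on isSHLDSlow() here. A minimal sketch of the i32 semantics, illustrative only:

#include <cstdint>
#include <cstdio>

// fshl(a, b, s): shift the concatenation a:b left by s and keep the high word.
static uint32_t fshl32(uint32_t A, uint32_t B, unsigned S) {
  S &= 31;
  return S ? (A << S) | (B >> (32 - S)) : A;
}

int main() {
  printf("%#010x\n", (unsigned)fshl32(0x12345678u, 0x9abcdef0u, 8));
  return 0;
}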
if (!Subtarget.useSoftFloat()) {
@@ -270,6 +266,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+
+ setOperationAction(ISD::LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f64, Custom);
+
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::LRINT, MVT::i64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::i64, Custom);
+ }
}
// Handle address space casts between mixed sized pointers.
@@ -347,34 +353,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
- if (Subtarget.is64Bit()) {
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::CTLZ , VT, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
}
}
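The two opcodes in this loop differ only in what they promise for a zero input: CTLZ_ZERO_UNDEF may assume the operand is non-zero, so it maps directly onto BSR as 31 - bsr(x), while plain CTLZ must also define ctlz(0); both get custom lowering here when LZCNT is unavailable. A small sketch of the distinction, assuming a GCC/Clang builtin:

#include <cstdio>

// __builtin_clz is undefined for 0, mirroring CTLZ_ZERO_UNDEF; plain CTLZ
// must handle the zero case explicitly.
static int ctlz32_zero_undef(unsigned X) { return __builtin_clz(X); } // X != 0
static int ctlz32(unsigned X) { return X ? __builtin_clz(X) : 32; }

int main() {
  printf("%d %d\n", ctlz32_zero_undef(0x00010000u), ctlz32(0));
  return 0;
}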
- // Special handling for half-precision floating point conversions.
- // If we don't have F16C support, then lower half float conversions
- // into library calls.
- if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
- setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
+ ISD::STRICT_FP_TO_FP16}) {
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ setOperationAction(
+ Op, MVT::f32,
+ (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(Op, MVT::f80, Expand);
+ setOperationAction(Op, MVT::f128, Expand);
}
- // There's never any support for operations beyond MVT::f32.
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
-
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
@@ -542,7 +542,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
+ } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+ (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -663,8 +664,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
- setOperationAction(ISD::LRINT, MVT::f80, Expand);
- setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
@@ -1038,8 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
- // With AVX512, expanding (and promoting the shifts) is better.
- if (!Subtarget.hasAVX512())
+ // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
+ // shifts) is better.
+ if (!Subtarget.useAVX512Regs() &&
+ !(Subtarget.hasBWI() && Subtarget.hasVLX()))
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
@@ -1078,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+
+ setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1170,6 +1175,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::FROUND, VT, Custom);
+
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1221,7 +1229,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
- if (!Subtarget.hasBWI())
+ if (!Subtarget.useBWIRegs())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
@@ -1412,19 +1420,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1432,7 +1444,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
@@ -1443,10 +1454,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ bool HasBWI = Subtarget.hasBWI();
+
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
@@ -1454,6 +1469,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ if (HasBWI)
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
@@ -1497,6 +1514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ if (HasBWI)
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
@@ -1509,19 +1528,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ if (HasBWI) {
+ // Extends from v64i1 masks to 512-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+ }
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1535,47 +1561,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
}
- // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
- for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+ for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
+
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ }
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
if (Subtarget.hasDQI()) {
@@ -1610,36 +1658,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- if (!Subtarget.hasBWI()) {
- // Need to custom split v32i16/v64i8 bitcasts.
- setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
-
- // Better to split these into two 256-bit ops.
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
+ if (HasBWI) {
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ } else {
+ setOperationAction(ISD::STORE, MVT::v32i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
- for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
- }// has AVX-512
+ }// useAVX512Regs
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
@@ -1667,6 +1721,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1746,12 +1813,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -1759,93 +1824,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
- }
-
- // This block controls legalization for v32i16 and v64i8. 512-bits can be
- // disabled based on prefer-vector-width and required-vector-width function
- // attributes.
- if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
- // Extends from v64i1 masks to 512-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v32i16, Legal);
- setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
-
- setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
-
- for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Legal);
- setOperationAction(ISD::SADDSAT, VT, Legal);
- setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SSUBSAT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
-
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
- }
-
- for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
- setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
- }
-
- if (Subtarget.hasBITALG()) {
- for (auto VT : { MVT::v64i8, MVT::v32i16 })
- setOperationAction(ISD::CTPOP, VT, Legal);
- }
-
- if (Subtarget.hasVBMI2()) {
- setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
- setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
- }
- }
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -1874,19 +1853,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
- if (Subtarget.hasDQI()) {
- // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
- // v2f32 UINT_TO_FP is already custom under SSE2.
- assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
- isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
- "Unexpected operation action!");
- // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
- }
-
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
@@ -1983,6 +1949,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
@@ -2000,6 +1967,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::STRICT_FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
@@ -2024,6 +1992,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::FP16_TO_FP);
+ setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
+ setTargetDAGCombine(ISD::FP_ROUND);
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2075,7 +2047,8 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+ !Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
@@ -2085,51 +2058,73 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+ const X86Subtarget &Subtarget) {
+ // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+ // convention is one that uses k registers.
+ if (NumElts == 2)
+ return {MVT::v2i64, 1};
+ if (NumElts == 4)
+ return {MVT::v4i32, 1};
+ if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v8i16, 1};
+ if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v16i8, 1};
+ // v32i1 passes in ymm unless we have BWI and the calling convention is
+ // regcall.
+ if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+ return {MVT::v32i8, 1};
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+ if (Subtarget.useAVX512Regs())
+ return {MVT::v64i8, 1};
+ return {MVT::v32i8, 2};
+ }
+
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+ NumElts > 64)
+ return {MVT::i8, NumElts};
+
+ return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
+
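For reference while reading the two calling-convention hooks below: the new helper centralizes the vXi1 argument-passing rules. A minimal standalone sketch of the same decision order follows, with LLVM's MVT, CallingConv and Subtarget machinery replaced by plain enums and flags; every name in it (RegTy, maskRegForCC, the boolean parameters) is illustrative, not LLVM API.

#include <cstdio>
#include <utility>

enum class RegTy { Invalid, v2i64, v4i32, v8i16, v16i8, v32i8, v64i8, i8 };

static std::pair<RegTy, unsigned>
maskRegForCC(unsigned NumElts, bool IsRegCall, bool IsOclBi, bool HasBWI,
             bool Use512Regs) {
  // Small masks are passed in XMM registers unless a k-register calling
  // convention (regcall / Intel OCL BI) is in use.
  if (NumElts == 2)
    return {RegTy::v2i64, 1};
  if (NumElts == 4)
    return {RegTy::v4i32, 1};
  if (NumElts == 8 && !IsRegCall && !IsOclBi)
    return {RegTy::v8i16, 1};
  if (NumElts == 16 && !IsRegCall && !IsOclBi)
    return {RegTy::v16i8, 1};
  // v32i1 rides in a YMM register unless BWI + regcall keep it in a k-register.
  if (NumElts == 32 && (!HasBWI || !IsRegCall))
    return {RegTy::v32i8, 1};
  // v64i1 takes one ZMM register, or two YMM halves when 512-bit registers
  // are disabled.
  if (NumElts == 64 && HasBWI && !IsRegCall) {
    if (Use512Regs)
      return {RegTy::v64i8, 1};
    return {RegTy::v32i8, 2};
  }
  // Wide or odd masks are broken into one i8 per element (AVX2 behavior).
  bool IsPow2 = NumElts != 0 && (NumElts & (NumElts - 1)) == 0;
  if (!IsPow2 || (NumElts == 64 && !HasBWI) || NumElts > 64)
    return {RegTy::i8, NumElts};
  return {RegTy::Invalid, 0};
}

int main() {
  // v64i1 with BWI but without 512-bit registers: expect two v32i8 halves.
  auto R = maskRegForCC(64, false, false, /*HasBWI=*/true, /*Use512Regs=*/false);
  std::printf("v64i1 -> %u register(s)\n", R.second);
  return 0;
}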
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return MVT::v32i8;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return MVT::i8;
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return MVT::v32i1;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return MVT::v16i32;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return 1;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return VT.getVectorNumElements();
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return 2;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return 1;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
+
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -2140,8 +2135,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
+ VT.getVectorNumElements() > 64)) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
@@ -2151,7 +2146,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
- RegisterVT = MVT::v32i1;
+ RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
@@ -2194,20 +2189,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
- if (VTy->getBitWidth() == 128)
- MaxAlign = 16;
+ if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
+ MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
@@ -2225,46 +2220,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ Align TyAlign = DL.getABITypeAlign(Ty);
if (TyAlign > 8)
- return TyAlign;
+ return TyAlign.value();
return 8;
}
- unsigned Align = 4;
+ Align Alignment(4);
if (Subtarget.hasSSE1())
- getMaxByValAlign(Ty, Align);
- return Align;
-}
-
-/// Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
+ getMaxByValAlign(Ty, Alignment);
+ return Alignment.value();
+}
+
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Op.size() >= 16 &&
+ (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
- if (Size >= 64 && Subtarget.hasAVX512() &&
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX() &&
+ if (Op.size() >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
@@ -2280,8 +2263,8 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
- } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
- !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
+ } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
+ Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
@@ -2294,7 +2277,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
- if (Subtarget.is64Bit() && Size >= 8)
+ if (Subtarget.is64Bit() && Op.size() >= 8)
return MVT::i64;
return MVT::i32;
}
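For reference: the hook above now consults a single MemOp descriptor instead of separate size/alignment/memset parameters. Below is a minimal sketch of the same selection policy written as a plain function under stated assumptions; the names are made up, and the x87/64-bit guard on the v4f32 path plus the memcpy/zero-memset predicates are collapsed into a single AllowF64Lowering flag, so treat it as an approximation rather than the actual implementation.

#include <cstddef>
#include <cstdio>

enum class MemVT { i32, i64, f64, v4f32, v32i8, v16i32, v64i8 };

static MemVT pickMemOpType(std::size_t Size, bool Aligned16,
                           bool SlowUnaligned16, bool HasSSE1, bool HasSSE2,
                           bool HasAVX, bool HasAVX512, bool HasBWI,
                           unsigned PreferWidth, bool Is64Bit,
                           bool AllowF64Lowering) {
  if (Size >= 16 && (!SlowUnaligned16 || Aligned16)) {
    if (Size >= 64 && HasAVX512 && PreferWidth >= 512)
      return HasBWI ? MemVT::v64i8 : MemVT::v16i32; // 64-byte accesses
    if (Size >= 32 && HasAVX && PreferWidth >= 256)
      return MemVT::v32i8;                          // 32-byte accesses
    if (HasSSE1 && PreferWidth >= 128)
      return MemVT::v4f32;                          // 16-byte accesses
  } else if (AllowF64Lowering && Size >= 8 && !Is64Bit && HasSSE2) {
    // 32-bit targets may use 8-byte FP stores for qualifying memset/memcpy.
    return MemVT::f64;
  }
  // Otherwise fall back to the widest natural integer type.
  return (Is64Bit && Size >= 8) ? MemVT::i64 : MemVT::i32;
}

int main() {
  MemVT VT = pickMemOpType(/*Size=*/128, /*Aligned16=*/true,
                           /*SlowUnaligned16=*/false, true, true, true, true,
                           true, /*PreferWidth=*/512, /*Is64Bit=*/true,
                           /*AllowF64Lowering=*/false);
  std::printf("picked enum value %d\n", static_cast<int>(VT)); // 6 == v64i8
  return 0;
}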
@@ -2611,7 +2594,7 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
@@ -2656,14 +2639,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
- SDValue Flag;
- SmallVector<SDValue, 6> RetOps;
- RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
- // Operand #1 = Bytes To Pop
- RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
- MVT::i32));
-
- // Copy the result values into the output registers.
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals;
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
@@ -2715,7 +2691,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
- RetOps.push_back(ValToCopy);
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
// Don't emit a copytoreg.
continue;
}
@@ -2736,31 +2712,39 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
}
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
- Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
Subtarget);
- assert(2 == RegsToPass.size() &&
- "Expecting two registers after Pass64BitArgInRegs");
-
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
+ }
+
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+ MVT::i32));
- // Add nodes to the DAG and add the values into the RetOps list
- for (auto &Reg : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+ // Copy the result values into the output registers.
+ for (auto &RetVal : RetVals) {
+ if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
+ RetOps.push_back(RetVal.second);
+ continue; // Don't emit a copytoreg.
}
+
+ Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Swift calling convention does not require we copy the sret argument
@@ -2775,7 +2759,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
- if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
@@ -2798,7 +2782,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
- unsigned RetValReg
+ Register RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
@@ -2924,7 +2908,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -3133,10 +3117,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- /*isVolatile*/false, /*AlwaysInline=*/true,
- /*isTailCall*/false,
- MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(
+ Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/true,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
@@ -3176,8 +3160,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
- ImmutableCallSite CS(CI);
- CallingConv::ID CalleeCC = CS.getCallingConv();
+ CallingConv::ID CalleeCC = CI->getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -3341,20 +3324,223 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
- return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
- [](const CCValAssign &A, const CCValAssign &B) -> bool {
- return A.getValNo() < B.getValNo();
- });
+ return llvm::is_sorted(
+ ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
+ return A.getValNo() < B.getValNo();
+ });
}
#endif
+namespace {
+/// This is a helper class for lowering variable-argument (vararg) parameters.
+class VarArgsLoweringHelper {
+public:
+ VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ CallingConv::ID CallConv, CCState &CCInfo)
+ : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
+ TheMachineFunction(DAG.getMachineFunction()),
+ TheFunction(TheMachineFunction.getFunction()),
+ FrameInfo(TheMachineFunction.getFrameInfo()),
+ FrameLowering(*Subtarget.getFrameLowering()),
+ TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
+ CCInfo(CCInfo) {}
+
+  // Lower variable-argument parameters.
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
+
+private:
+ void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
+
+ void forwardMustTailParameters(SDValue &Chain);
+
+ bool is64Bit() { return Subtarget.is64Bit(); }
+ bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
+
+ X86MachineFunctionInfo *FuncInfo;
+ const SDLoc &DL;
+ SelectionDAG &DAG;
+ const X86Subtarget &Subtarget;
+ MachineFunction &TheMachineFunction;
+ const Function &TheFunction;
+ MachineFrameInfo &FrameInfo;
+ const TargetFrameLowering &FrameLowering;
+ const TargetLowering &TargLowering;
+ CallingConv::ID CallConv;
+ CCState &CCInfo;
+};
+} // namespace
+
+void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
+ SDValue &Chain, unsigned StackSize) {
+ // If the function takes variable number of arguments, make a frame index for
+  // If the function takes a variable number of arguments, make a frame index
+  // for the start of the first vararg value... for expansion of llvm.va_start.
+  // We can skip this if there are no va_start calls.
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(
+ FrameInfo.CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget.useSoftFloat() &&
+ TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (is64Bit()) {
+ // Find the first unallocated argument registers.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs =
+ get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ if (isWin64()) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
+ }
+
+    // CopyFromReg SDValues for the live-in GPR argument registers.
+    SmallVector<SDValue, 6> LiveGPRs;
+    // CopyFromReg SDValues for the live-in XMM argument registers.
+    SmallVector<SDValue, 8> LiveXMMRegs;
+    // SDValue for the %al register, used only when XMM registers are live.
+    SDValue ALVal;
+
+ // Gather all the live in physical registers.
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
+ }
+ const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
+ if (!AvailableXmms.empty()) {
+ Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
+ for (MCPhysReg Reg : AvailableXmms) {
+ Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
+ }
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ TargLowering.getPointerTy(DAG.getDataLayout()));
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, DL,
+ TargLowering.getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, DL));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset));
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ if (!LiveXMMRegs.empty()) {
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+ }
+}
+
+void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget.useAVX512Regs() &&
+ (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget.hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget.hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
+ Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &FR : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
+ FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
+ TargLowering.getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
+ }
+}
+
+void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
+ unsigned StackSize) {
+  // Set FrameIndex to 0xAAAAAAA to mark it as not yet set.
+  // If necessary, it will be set to the correct value later.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+
+ if (FrameInfo.hasVAStart())
+ createVarArgAreaAndStoreRegisters(Chain, StackSize);
+
+ if (FrameInfo.hasMustTailInVarArgFunc())
+ forwardMustTailParameters(Chain);
+}
+
SDValue X86TargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
@@ -3366,16 +3552,16 @@ SDValue X86TargetLowering::LowerFormalArguments(
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
- !(isVarArg && canGuaranteeTCO(CallConv)) &&
+ !(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Ins, CC_X86);
@@ -3446,7 +3632,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else
llvm_unreachable("Unknown argument type!");
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
@@ -3500,7 +3686,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
- unsigned Reg = FuncInfo->getSRetReturnReg();
+ Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
@@ -3518,147 +3704,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
- // If the function takes variable number of arguments, make a frame index for
- // the start of the first vararg value... for expansion of llvm.va_start. We
- // can skip this if there are no va_start calls.
- if (MFI.hasVAStart() &&
- (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
- CallConv != CallingConv::X86_ThisCall))) {
- FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
- }
-
- // Figure out if XMM registers are in use.
- assert(!(Subtarget.useSoftFloat() &&
- F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue.
- if (Is64Bit && isVarArg && MFI.hasVAStart()) {
- // Find the first unallocated argument registers.
- ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
- ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
- assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // Gather all the live in physical registers.
- SmallVector<SDValue, 6> LiveGPRs;
- SmallVector<SDValue, 8> LiveXMMRegs;
- SDValue ALVal;
- for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
- unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
- LiveGPRs.push_back(
- DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
- }
- if (!ArgXMMs.empty()) {
- unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
- for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
- unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
- LiveXMMRegs.push_back(
- DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
- }
- }
-
- if (IsWin64) {
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
- MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by dereferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy(DAG.getDataLayout()));
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
- RSFIN, DAG.getIntPtrConstant(Offset, dl));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(),
- FuncInfo->getRegSaveFrameIndex(), Offset));
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex(), dl));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset(), dl));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- }
-
- if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
- // Find the largest legal vector type.
- MVT VecVT = MVT::Other;
- // FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget.useAVX512Regs() &&
- (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
- CallConv == CallingConv::Intel_OCL_BI)))
- VecVT = MVT::v16f32;
- else if (Subtarget.hasAVX())
- VecVT = MVT::v8f32;
- else if (Subtarget.hasSSE2())
- VecVT = MVT::v4f32;
-
- // We forward some GPRs and some vector types.
- SmallVector<MVT, 2> RegParmTypes;
- MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
- RegParmTypes.push_back(IntVT);
- if (VecVT != MVT::Other)
- RegParmTypes.push_back(VecVT);
-
- // Compute the set of forwarded registers. The rest are scratch.
- SmallVectorImpl<ForwardedRegister> &Forwards =
- FuncInfo->getForwardedMustTailRegParms();
- CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
-
- // Forward AL for SysV x86_64 targets, since it is used for varargs.
- if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
- unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
- }
-
- // Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &FR : Forwards) {
- // FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
- FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
- Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
- }
- }
+ if (IsVarArg)
+ VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
+ .lowerVarArgsParameters(Chain, StackSize);
// Some CCs need callee pop.
- if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
@@ -3677,10 +3728,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
- if (CallConv == CallingConv::X86_FastCall ||
- CallConv == CallingConv::X86_ThisCall)
- // fastcc functions can't have varargs.
- FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
@@ -3697,7 +3744,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -3705,7 +3752,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
+ for (std::pair<Register, Register> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
@@ -3716,12 +3763,13 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+ ISD::ArgFlagsTy Flags,
+ bool isByVal) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
- if (Flags.isByVal())
+ if (isByVal)
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
@@ -3796,18 +3844,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
+ const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
- const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
-
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3823,7 +3870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
}
- bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
@@ -3854,7 +3901,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Outs, CC_X86);
@@ -3900,6 +3947,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
+ } else if (CLI.IsPreallocated) {
+ assert(ArgLocs.back().isMemLoc() &&
+ "cannot use preallocated attribute on a register "
+ "parameter");
+ SmallVector<size_t, 4> PreallocatedOffsets;
+ for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+ if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+ PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+ }
+ }
+ auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
+ MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
+ MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+ NumBytesToPush = 0;
}
if (!IsSibcall && !IsMustTail)
@@ -3912,7 +3974,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
@@ -3927,9 +3989,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
- // Skip inalloca arguments, they have already been written.
+ // Skip inalloca/preallocated arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
- if (Flags.isInAlloca())
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
CCValAssign &VA = ArgLocs[I];
@@ -3968,8 +4030,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
- Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
- false);
+ Flags.getByValSize(),
+ std::max(Align(16), Flags.getNonZeroByValAlign()), false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
@@ -3998,12 +4060,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
- unsigned ShadowReg = 0;
+ Register ShadowReg;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
@@ -4019,7 +4081,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
- dl, DAG, VA, Flags));
+ dl, DAG, VA, Flags, isByVal));
}
}
@@ -4031,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
- unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
@@ -4069,8 +4131,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
-
- RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ RegsToPass.push_back(std::make_pair(Register(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
@@ -4079,7 +4140,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ RegsToPass.push_back(std::make_pair(F.PReg, Val));
}
}
@@ -4117,8 +4178,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
- // Skip inalloca arguments. They don't require any work.
- if (Flags.isInAlloca())
+ // Skip inalloca/preallocated arguments. They don't require any work.
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
@@ -4219,7 +4280,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
- if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
+ if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
@@ -4278,11 +4339,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
- if (CLI.CS)
- if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ if (CLI.CB)
+ if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
@@ -4301,12 +4363,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
- if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
- // No need to reset the stack after the call if the call doesn't return. To
- // make the MI verify, we'll pretend the callee does it for us.
- NumBytesForCalleeToPop = NumBytes;
- }
-
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
@@ -4337,7 +4393,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
-// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
@@ -4359,7 +4415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
- const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
@@ -4395,7 +4451,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
- unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
@@ -4578,7 +4634,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Allocate shadow area for Win64
if (IsCalleeWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
@@ -4693,6 +4749,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -4739,6 +4796,13 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
}
}
+static bool isTargetShuffleSplat(SDValue Op) {
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::EXTRACT_SUBVECTOR)
+ return isTargetShuffleSplat(Op.getOperand(0));
+ return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
+}
+
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -4972,7 +5036,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4985,7 +5049,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
break;
}
@@ -4997,7 +5061,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -5146,7 +5210,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
-bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
@@ -5374,6 +5439,19 @@ static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
+/// Return true if the value of any element in Mask is the zero sentinel value.
+static bool isAnyZero(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+}
+
+/// Return true if the value of any element in Mask is the zero or undef
+/// sentinel value.
+static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) {
+ return M == SM_SentinelZero || M == SM_SentinelUndef;
+ });
+}
+
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -5511,6 +5589,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) {
return canWidenShuffleElements(Mask, WidenedMask);
}
+// Attempt to narrow/widen shuffle mask until it matches the target number of
+// elements.
+static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
+ SmallVectorImpl<int> &ScaledMask) {
+ unsigned NumSrcElts = Mask.size();
+ assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
+ "Illegal shuffle scale factor");
+
+ // Narrowing is guaranteed to work.
+ if (NumDstElts >= NumSrcElts) {
+ int Scale = NumDstElts / NumSrcElts;
+ llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ return true;
+ }
+
+ // We have to repeat the widening until we reach the target size, but we can
+ // split out the first widening as it sets up ScaledMask for us.
+ if (canWidenShuffleElements(Mask, ScaledMask)) {
+ while (ScaledMask.size() > NumDstElts) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(ScaledMask, WidenedMask))
+ return false;
+ ScaledMask = std::move(WidenedMask);
+ }
+ return true;
+ }
+
+ return false;
+}
+
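A standalone sketch of the two mask transformations scaleShuffleElements relies on, using plain ints with -1 as the undef sentinel. The helper names are invented for the example, and the widening step is simplified: it ignores the zero sentinel and the mixed undef/defined pairs that canWidenShuffleElements also accepts.

#include <cstddef>
#include <cstdio>
#include <vector>

// Narrow mask elements: index M expands to Scale consecutive sub-indices;
// negative sentinels (-1 undef) are simply replicated.
static std::vector<int> narrowMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Out.push_back(M < 0 ? M : M * Scale + i);
  return Out;
}

// Widen by one step: each pair must address both halves of one wider element.
static bool widenMaskOnce(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.clear();
  for (std::size_t i = 0, e = Mask.size(); i + 1 < e; i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 < 0 && M1 < 0) { Out.push_back(-1); continue; }
    if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1) { Out.push_back(M0 / 2); continue; }
    return false; // pair does not form a whole wider element
  }
  return true;
}

int main() {
  std::vector<int> M = {0, 2, -1, 7};
  std::vector<int> N = narrowMask(2, M); // 0 1 4 5 -1 -1 14 15
  std::vector<int> W;
  if (widenMaskOnce(N, W))               // recovers 0 2 -1 7
    for (int X : W) std::printf("%d ", X);
  std::printf("\n");
}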
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -5725,7 +5833,7 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
-// Helper function to collect subvector ops that are concated together,
+// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
@@ -5736,8 +5844,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
return true;
}
- if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
@@ -5746,19 +5853,93 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
- Idx == (VT.getVectorNumElements() / 2) &&
- Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(1).getValueType() == SubVT &&
- isNullConstant(Src.getOperand(2))) {
- Ops.push_back(Src.getOperand(1));
- Ops.push_back(Sub);
- return true;
+ Idx == (VT.getVectorNumElements() / 2)) {
+ // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ // insert_subvector(x, extract_subvector(x, lo), hi)
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+ Ops.append(2, Sub);
+ return true;
+ }
}
}
return false;
}
+static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
+ "Can't split odd sized vector");
+
+ SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
+ return std::make_pair(Lo, Hi);
+}
+
+// Split a unary integer op into two half-sized ops.
+static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Make sure we only try to split 256/512-bit types to avoid creating
+ // narrow vectors.
+ assert((Op.getOperand(0).getValueType().is256BitVector() ||
+ Op.getOperand(0).getValueType().is512BitVector()) &&
+ (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+ assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements() &&
+ "Unexpected VTs!");
+
+ SDLoc dl(Op);
+
+ // Extract the Lo/Hi vectors
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
+}
+
+/// Break a binary integer operation into two half-sized ops and then
+/// concatenate the result back.
+static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Sanity check that all the types match.
+ assert(Op.getOperand(0).getValueType() == VT &&
+ Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
+ assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+
+ SDLoc dl(Op);
+
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
+}
+
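The split-and-concat structure of splitVectorIntBinary is independent of the SelectionDAG machinery. A minimal sketch on plain arrays, with an ordinary add standing in for the legal half-width operation (names and values are illustrative only):

#include <cstdio>
#include <vector>

using Vec = std::vector<int>;

static Vec lowerHalf(const Vec &V) { return Vec(V.begin(), V.begin() + V.size() / 2); }
static Vec upperHalf(const Vec &V) { return Vec(V.begin() + V.size() / 2, V.end()); }
static Vec concat(const Vec &A, const Vec &B) {
  Vec R = A;
  R.insert(R.end(), B.begin(), B.end());
  return R;
}

// A "narrow" add that stands in for the legal half-width operation.
static Vec addNarrow(const Vec &A, const Vec &B) {
  Vec R;
  for (std::size_t i = 0; i != A.size(); ++i)
    R.push_back(A[i] + B[i]);
  return R;
}

// Mirror of splitVectorIntBinary: split both operands, run the narrow op
// twice, and concatenate the halves back into the wide result.
static Vec splitBinaryAdd(const Vec &A, const Vec &B) {
  return concat(addNarrow(lowerHalf(A), lowerHalf(B)),
                addNarrow(upperHalf(A), upperHalf(B)));
}

int main() {
  Vec A = {1, 2, 3, 4, 5, 6, 7, 8}, B = {10, 20, 30, 40, 50, 60, 70, 80};
  for (int X : splitBinaryAdd(A, B)) std::printf("%d ", X);
  std::printf("\n"); // 11 22 33 44 55 66 77 88
}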
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5815,21 +5996,17 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ unsigned IdxVal = Op.getConstantOperandVal(2);
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
-
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
@@ -5849,7 +6026,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
-
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
@@ -5900,7 +6076,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
- // isel to opimitize when bits are known zero.
+ // isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
@@ -6042,8 +6218,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+ V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
@@ -6067,6 +6243,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
return SDValue();
}
+void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo, bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+}
+
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i < NumElts; ++i) {
+ int Pos = i / 2;
+ Pos += (Lo ? 0 : NumElts / 2);
+ Mask.push_back(Pos);
+ }
+}
+
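The index arithmetic of createUnpackShuffleMask is easy to reproduce standalone. The sketch below mirrors its loop with plain ints and prints the v8i16 cases; the function name and the chosen examples are for illustration only.

#include <cstdio>
#include <vector>

// Same index arithmetic as createUnpackShuffleMask, with 128-bit lanes.
static std::vector<int> unpackMask(int NumElts, int EltBits, bool Lo, bool Unary) {
  int NumEltsInLane = 128 / EltBits;
  std::vector<int> Mask;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
  return Mask;
}

int main() {
  // v8i16 unpacklo, binary: 0 8 1 9 2 10 3 11 (the classic punpcklwd layout).
  for (int M : unpackMask(8, 16, /*Lo=*/true, /*Unary=*/false)) std::printf("%d ", M);
  std::printf("\n");
  // v8i16 unpacklo, unary: 0 0 1 1 2 2 3 3, which for a single 128-bit lane
  // matches the createSplat2ShuffleMask Lo example above.
  for (int M : unpackMask(8, 16, true, true)) std::printf("%d ", M);
  std::printf("\n");
}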
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -6102,14 +6307,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
- if (!Load || !ISD::isNormalLoad(Load))
- return nullptr;
-
- SDValue Ptr = Load->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
+static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
+ if (Ptr.getOpcode() == X86ISD::Wrapper ||
+ Ptr.getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr.getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
@@ -6118,6 +6319,12 @@ static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
+ return nullptr;
+ return getTargetConstantFromBasePtr(Load->getBasePtr());
+}
+
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
@@ -6298,23 +6505,6 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a broadcasted constant pool scalar.
- if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= VT.getScalarSizeInBits()) {
- if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
-
- APInt UndefSrcElts(NumSrcElts, 0);
- SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
- if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
- if (UndefSrcElts[0])
- UndefSrcElts.setBits(0, NumSrcElts);
- SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
- return CastBitData(UndefSrcElts, SrcEltBits);
- }
- }
- }
-
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
@@ -6322,16 +6512,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
SDValue Ptr = MemIntr->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!CNode || CNode->isMachineConstantPoolEntry() ||
- CNode->getOffset() != 0)
- return false;
-
- if (const Constant *C = CNode->getConstVal()) {
+ if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
@@ -6375,8 +6556,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Insert constant bits from a base and sub vector sources.
- if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6398,8 +6578,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector's source.
- if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(1))) {
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6468,11 +6647,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
namespace llvm {
namespace X86 {
-bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
- UndefElts, EltBits, true, false)) {
+ UndefElts, EltBits, true,
+ AllowPartialUndefs)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
@@ -6513,20 +6693,26 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
+/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
- bool Unary) {
+ bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
+ unsigned Repetitions = 1u << (NumStages - 1);
+ unsigned Increment = 1u << NumStages;
+ assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane));
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ }
}
}
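The effect of NumStages is easiest to see on concrete masks. This standalone sketch mirrors the loop above and prints the single-stage byte mask for a v16i8 pack of two v8i16 sources, then the two-stage variant; the helper name and the chosen types are illustrative assumptions.

#include <cstdio>
#include <vector>

// Same index arithmetic as createPackShuffleMask; VT is the packed result type.
static std::vector<int> packMask(unsigned NumElts, unsigned EltBits, bool Unary,
                                 unsigned NumStages) {
  unsigned NumLanes = (NumElts * EltBits) / 128;
  unsigned NumEltsPerLane = 128 / EltBits;
  unsigned Offset = Unary ? 0 : NumElts;
  unsigned Repetitions = 1u << (NumStages - 1);
  unsigned Increment = 1u << NumStages;
  std::vector<int> Mask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
    for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane);           // first source
      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
        Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);  // second source
    }
  return Mask;
}

int main() {
  // One stage, v16i8 result of two v8i16 inputs: 0 2 4 ... 14 16 18 ... 30
  // (the even bytes of each source, i.e. the low byte of every word).
  for (int M : packMask(16, 8, /*Unary=*/false, 1)) std::printf("%d ", M);
  std::printf("\n");
  // Two stages on the same types: every 4th byte of each source, with the
  // pattern repeated once per extra stage.
  for (int M : packMask(16, 8, false, 2)) std::printf("%d ", M);
  std::printf("\n");
}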
@@ -6597,7 +6783,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
- SDValue ImmN;
+ uint64_t ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
@@ -6608,23 +6794,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeBLENDMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeSHUFPMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeINSERTPSMask(ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
@@ -6672,13 +6857,23 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::VALIGN:
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVALIGNMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
+ break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePALIGNRMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
@@ -6686,39 +6881,34 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSLLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSRLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFHWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFLWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
@@ -6770,8 +6960,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERMMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -6783,17 +6973,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
@@ -6875,9 +7063,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
// Check if we're getting a shuffle mask with zero'd elements.
- if (!AllowSentinelZero)
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
- return false;
+ if (!AllowSentinelZero && isAnyZero(Mask))
+ return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
@@ -7060,6 +7247,20 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
continue;
}
+ // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
+ // base vectors.
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = V.getOperand(0);
+ int NumVecElts = Vec.getValueType().getVectorNumElements();
+ if (Vec.isUndef() && Size == NumVecElts) {
+ int Idx = V.getConstantOperandVal(2);
+ int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
+ if (M < Idx || (Idx + NumSubElts) <= M)
+ KnownUndef.setBit(i);
+ }
+ continue;
+ }
+
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
@@ -7111,7 +7312,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
@@ -7120,7 +7321,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
@@ -7132,6 +7333,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -7179,8 +7382,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
- unsigned NumSizeInBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
@@ -7220,10 +7421,21 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
+
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+ if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
- scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
- scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
+ narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
+ narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
@@ -7245,14 +7457,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
- if (!isa<ConstantSDNode>(N.getOperand(2)) ||
- !N->isOnlyUserOf(Sub.getNode()))
+ if (!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Sub.getOperand(0).getValueType() == VT &&
- isa<ConstantSDNode>(Sub.getOperand(1))) {
+ Sub.getOperand(0).getValueType() == VT) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
@@ -7268,13 +7478,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
+
+ // Subvector shuffle inputs must not be larger than the subvector.
+ if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
+ return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
+ }))
+ return false;
+
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
- scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
@@ -7284,14 +7501,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
}
Ops.push_back(Src);
- for (SDValue &SubInput : SubInputs) {
- EVT SubSVT = SubInput.getValueType().getScalarType();
- EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
- NumSizeInBits / SubSVT.getSizeInBits());
- Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
- DAG.getUNDEF(AltVT), SubInput,
- DAG.getIntPtrConstant(0, SDLoc(N))));
- }
+ Ops.append(SubInputs.begin(), SubInputs.end());
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
@@ -7304,75 +7514,83 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
return true;
}
- case ISD::SCALAR_TO_VECTOR: {
- // Match against a scalar_to_vector of an extract from a vector,
- // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
- SDValue N0 = N.getOperand(0);
- SDValue SrcExtract;
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::INSERT_VECTOR_ELT: {
+ // Match against an insert_vector_elt/scalar_to_vector of an extract from a
+ // vector, for matching src/dst vector types.
+ SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
+
+ unsigned DstIdx = 0;
+ if (Opcode != ISD::SCALAR_TO_VECTOR) {
+ // Check we have an in-range constant insertion index.
+ if (!isa<ConstantSDNode>(N.getOperand(2)) ||
+ N.getConstantOperandAPInt(2).uge(NumElts))
+ return false;
+ DstIdx = N.getConstantOperandVal(2);
+
+ // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
+ if (X86::isZeroNode(Scl)) {
+ Ops.push_back(N.getOperand(0));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
+ }
- if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- N0.getOperand(0).getValueType() == VT) ||
- (N0.getOpcode() == X86ISD::PEXTRW &&
- N0.getOperand(0).getValueType() == MVT::v8i16) ||
- (N0.getOpcode() == X86ISD::PEXTRB &&
- N0.getOperand(0).getValueType() == MVT::v16i8)) {
- SrcExtract = N0;
+ // Peek through trunc/aext/zext.
+ // TODO: aext shouldn't require SM_SentinelZero padding.
+ // TODO: handle shift of scalars.
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
+ while (Scl.getOpcode() == ISD::TRUNCATE ||
+ Scl.getOpcode() == ISD::ANY_EXTEND ||
+ Scl.getOpcode() == ISD::ZERO_EXTEND) {
+ Scl = Scl.getOperand(0);
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
}
+ if ((MinBitsPerElt % 8) != 0)
+ return false;
+ // Attempt to find the source vector the scalar was extracted from.
+ SDValue SrcExtract;
+ if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Scl.getOpcode() == X86ISD::PEXTRW ||
+ Scl.getOpcode() == X86ISD::PEXTRB) &&
+ Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
+ SrcExtract = Scl;
+ }
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
-
- unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
- if (NumSrcElts <= SrcIdx)
+ if (!SrcVT.getScalarType().isByteSized())
return false;
-
- Ops.push_back(SrcVec);
- Mask.push_back(SrcIdx);
- Mask.append(NumZeros, SM_SentinelZero);
- Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
- return true;
- }
- case X86ISD::PINSRB:
- case X86ISD::PINSRW: {
- SDValue InVec = N.getOperand(0);
- SDValue InScl = N.getOperand(1);
- SDValue InIndex = N.getOperand(2);
- if (!isa<ConstantSDNode>(InIndex) ||
- cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t InIdx = N.getConstantOperandVal(2);
-
- // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
- if (X86::isZeroNode(InScl)) {
- Ops.push_back(InVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
- return true;
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+ unsigned DstByte = DstIdx * NumBytesPerElt;
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
+
+ // Create 'identity' byte level shuffle mask and then add inserted bytes.
+ if (Opcode == ISD::SCALAR_TO_VECTOR) {
+ Ops.push_back(SrcVec);
+ Mask.append(NumSizeInBytes, SM_SentinelUndef);
+ } else {
+ Ops.push_back(SrcVec);
+ Ops.push_back(N.getOperand(0));
+ for (int i = 0; i != (int)NumSizeInBytes; ++i)
+ Mask.push_back(NumSizeInBytes + i);
}
- // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
- // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
- unsigned ExOp =
- (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
- if (InScl.getOpcode() != ExOp)
- return false;
-
- SDValue ExVec = InScl.getOperand(0);
- SDValue ExIndex = InScl.getOperand(1);
- if (!isa<ConstantSDNode>(ExIndex) ||
- cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t ExIdx = InScl.getConstantOperandVal(1);
-
- Ops.push_back(InVec);
- Ops.push_back(ExVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
+ unsigned MinBytesPerElts = MinBitsPerElt / 8;
+ MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+ for (unsigned i = 0; i != MinBytesPerElts; ++i)
+ Mask[DstByte + i] = SrcByte + i;
+ for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+ Mask[DstByte + i] = SM_SentinelZero;
return true;
}
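A standalone walk-through of the byte-level mask built in this case, for an assumed example: a 16-bit scalar extracted from element 3 of one v8i16 vector and inserted into element 1 of another. The concrete constants, and -2 standing in for SM_SentinelZero, are illustrative only.

#include <cstdio>

int main() {
  const int NumSizeInBytes = 16, NumBytesPerElt = 2; // v8i16, byte granularity
  const int DstIdx = 1, SrcIdx = 3;                  // assumed insertion/extraction indices
  const int MinBytesPerElt = 2;                      // the full 16-bit scalar survives
  int Mask[NumSizeInBytes];

  // Identity over the destination vector; it is the second shuffle operand,
  // so its bytes live at offsets NumSizeInBytes..2*NumSizeInBytes-1.
  for (int i = 0; i != NumSizeInBytes; ++i)
    Mask[i] = NumSizeInBytes + i;

  // Overlay the inserted bytes taken from the source vector (first operand),
  // then zero-fill any remaining bytes of the destination element (none here).
  int DstByte = DstIdx * NumBytesPerElt, SrcByte = SrcIdx * NumBytesPerElt;
  for (int i = 0; i != MinBytesPerElt; ++i)
    Mask[DstByte + i] = SrcByte + i;
  for (int i = MinBytesPerElt; i != NumBytesPerElt; ++i)
    Mask[DstByte + i] = -2;                          // SM_SentinelZero stand-in

  for (int i = 0; i != NumSizeInBytes; ++i) std::printf("%d ", Mask[i]);
  std::printf("\n"); // 16 17 6 7 20 21 22 23 24 25 26 27 28 29 30 31
}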
case X86ISD::PACKSS:
@@ -7412,6 +7630,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
+ case X86ISD::VTRUNC: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ // Truncated source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+ unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+ assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ Mask.push_back(i * Scale);
+ Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+ Ops.push_back(Src);
+ return true;
+ }
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -7426,40 +7661,43 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
break;
uint64_t ByteShift = ShiftVal / 8;
- unsigned NumBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
- Mask.append(NumBytes, SM_SentinelZero);
+ Mask.append(NumSizeInBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI: {
+ // We can only decode 'whole byte' bit rotates as shuffles.
+ uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
+ if ((RotateVal % 8) != 0)
+ return false;
+ Ops.push_back(N.getOperand(0));
+ int Offset = RotateVal / 8;
+ Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int BaseIdx = i * NumBytesPerElt;
+ for (int j = 0; j != (int)NumBytesPerElt; ++j) {
+ Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
+ }
+ }
+ return true;
+ }
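Whole-byte rotates decode to a per-element byte permutation; the sketch below mirrors the index math above on plain ints (the helper name and the v4i32 example are illustrative only).

#include <cstdio>
#include <vector>

// Mirror of the VROTLI/VROTRI decode: rotate each element's bytes in place.
static std::vector<int> rotateMask(int NumElts, int NumBytesPerElt,
                                   int RotateBits, bool Left) {
  int Offset = RotateBits / 8;
  if (Left)
    Offset = NumBytesPerElt - Offset; // left bit-rotates flip the byte offset
  std::vector<int> Mask;
  for (int i = 0; i != NumElts; ++i) {
    int BaseIdx = i * NumBytesPerElt;
    for (int j = 0; j != NumBytesPerElt; ++j)
      Mask.push_back(BaseIdx + (Offset + j) % NumBytesPerElt);
  }
  return Mask;
}

int main() {
  // v4i32 rotated left by 8 bits: each dword's result bytes come from source
  // bytes 3 0 1 2 (little-endian), repeated per element.
  for (int M : rotateMask(4, 4, 8, /*Left=*/true)) std::printf("%d ", M);
  std::printf("\n"); // 3 0 1 2  7 4 5 6  11 8 9 10  15 12 13 14
}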
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
- if (!SrcVT.isVector())
+ if (!Src.getSimpleValueType().isVector())
return false;
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal broadcast type");
- SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
- NumSizeInBits / SrcVT.getScalarSizeInBits());
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
@@ -7476,22 +7714,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
- unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
- DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
- Mask);
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal zero-extension type");
- SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
- NumSizeInBits / NumSrcBitsPerElt);
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ IsAnyExtend, Mask);
Ops.push_back(Src);
return true;
}
@@ -7549,7 +7775,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
APInt &KnownUndef, APInt &KnownZero,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7570,7 +7796,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth = 0,
+ const SelectionDAG &DAG, unsigned Depth = 0,
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7583,93 +7809,107 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
KnownZero, DAG, Depth, ResolveKnownElts);
}
-/// Returns the scalar element that will make up the ith
+/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
-static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
- unsigned Depth) {
- if (Depth == 6)
- return SDValue(); // Limit search depth.
+static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
+ SelectionDAG &DAG, unsigned Depth) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
- SDValue V = SDValue(N, 0);
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
- if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+ if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
- unsigned NumElems = VT.getVectorNumElements();
- SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
- : SV->getOperand(1);
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+ SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- MVT ShufVT = V.getSimpleValueType();
+ MVT ShufVT = VT.getSimpleVT();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
- if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
+ ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
- return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
- assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
- SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
- Depth+1);
+ assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
+ SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
- if (Opcode == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
- SDValue Vec = N->getOperand(0);
- SDValue Sub = N->getOperand(1);
- EVT SubVT = Sub.getValueType();
- unsigned NumSubElts = SubVT.getVectorNumElements();
- uint64_t SubIdx = N->getConstantOperandVal(2);
+ if (Opcode == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t SubIdx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
- return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
- return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
+ }
+
+ // Recurse into concat_vectors sub vector to find scalars.
+ if (Opcode == ISD::CONCAT_VECTORS) {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = Index / NumSubElts;
+ uint64_t SubElt = Index % NumSubElts;
+ return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
- if (Opcode == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(1))) {
- SDValue Src = N->getOperand(0);
- uint64_t SrcIdx = N->getConstantOperandVal(1);
- return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ if (Opcode == ISD::EXTRACT_SUBVECTOR) {
+ SDValue Src = Op.getOperand(0);
+ uint64_t SrcIdx = Op.getConstantOperandVal(1);
+ return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
}
- // Actual nodes that may contain scalar elements
+ // We only peek through bitcasts of the same vector width.
if (Opcode == ISD::BITCAST) {
- V = V.getOperand(0);
- EVT SrcVT = V.getValueType();
- unsigned NumElems = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
+ return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
+ return SDValue();
+ }
- if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
- return SDValue();
+ // Actual nodes that may contain scalar elements
+
+ // For insert_vector_elt - either return the index matching scalar or recurse
+ // into the base vector.
+ if (Opcode == ISD::INSERT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getConstantOperandAPInt(2) == Index)
+ return Op.getOperand(1);
+ return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
}
- if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return (Index == 0) ? V.getOperand(0)
+ if (Opcode == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? Op.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
- if (V.getOpcode() == ISD::BUILD_VECTOR)
- return V.getOperand(Index);
+ if (Opcode == ISD::BUILD_VECTOR)
+ return Op.getOperand(Index);
return SDValue();
}
@@ -7762,10 +8002,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
Elt = NextElt;
}
- // If our first insertion is not the first index then insert into zero
- // vector to break any register dependency else use SCALAR_TO_VECTOR.
+ // If our first insertion is not the first index or zeros are needed, then
+ // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
+ // elements undefined).
if (!V) {
- if (i != 0)
+ if (i != 0 || NumZero)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
@@ -7964,11 +8205,12 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// FIXME: 256-bit vector instructions don't require a strict alignment,
// improve this code to support it better.
- unsigned RequiredAlign = VT.getSizeInBits()/8;
+ Align RequiredAlign(VT.getSizeInBits() / 8);
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+ MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
+ if (!InferredAlign || *InferredAlign < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
@@ -7983,9 +8225,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % RequiredAlign) & 3)
+ if ((Offset % RequiredAlign.value()) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -8024,8 +8266,8 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = IdxC->getZExtValue();
if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
ByteOffset += Idx / 8;
return true;
@@ -8033,13 +8275,13 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
}
break;
case ISD::EXTRACT_VECTOR_ELT:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
SDValue Src = Elt.getOperand(0);
unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
findEltLoadSrc(Src, Ld, ByteOffset)) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ uint64_t Idx = IdxC->getZExtValue();
ByteOffset += Idx * (SrcSizeInBits / 8);
return true;
}
@@ -8169,7 +8411,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+ LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
+ MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
@@ -8247,14 +8490,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
+ // Allow v4f32 on SSE1 only targets.
+ // FIXME: Add more isel patterns so we can just use VT directly.
+ if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
+ VecVT = MVT::v4f32;
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
- LDBase->getPointerInfo(),
- LDBase->getAlignment(),
- MachineMemOperand::MOLoad);
+ SDValue ResNode = DAG.getMemIntrinsicNode(
+ X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
+ LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
@@ -8318,13 +8563,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
-static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
@@ -8439,7 +8684,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
- // From this paterrn:
+ // From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
@@ -8486,8 +8731,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
- if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
- !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ if (SplatBitSize == 32 || SplatBitSize == 64 ||
+ (SplatBitSize < 32 && Subtarget.hasAVX2())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
@@ -8496,46 +8741,25 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
- return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize == 32 || SplatBitSize == 64) {
- // Splatted value can fit in one FLOAT constant in constant pool.
- // Load the constant and broadcast it.
- // AVX have support for 32 and 64 bit broadcast for floats only.
- // No 64bit integer in 32bit subtarget.
- MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- // Lower the splat via APFloat directly, to avoid any conversion.
- Constant *C =
- SplatBitSize == 32
- ? ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEsingle(), SplatValue))
- : ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEdouble(), SplatValue));
- SDValue CP = DAG.getConstantPool(C, PVT);
- unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
-
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ SDVTList Tys =
+ DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ SDValue Brdcst = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize > 64) {
+ }
+ if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
- unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -8560,10 +8784,12 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
- if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ // FIXME: Is the use count needed for non-constant, non-load case?
+ if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
@@ -8603,18 +8829,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
+ MPI, Alignment, MachineMemOperand::MOLoad);
}
}
- bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
@@ -8624,15 +8849,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
if (!IsLoad)
return SDValue();
+ // Make sure the non-chain result is only used by this build vector.
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
+ return SDValue();
+
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (Subtarget.hasVLX() && ScalarSize == 64))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ (Subtarget.hasVLX() && ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
+ }
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
- if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
- if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
+ (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
}
// Unsupported broadcast.
@@ -8746,20 +8990,6 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return NV;
}
-static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
- assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
- Op.getScalarValueSizeInBits() == 1 &&
- "Can not convert non-constant vector");
- uint64_t Immediate = 0;
- for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
- SDValue In = Op.getOperand(idx);
- if (!In.isUndef())
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
- }
- SDLoc dl(Op);
- MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
- return DAG.getConstant(Immediate, dl, VT);
-}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8782,11 +9012,11 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
- if (!isa<ConstantSDNode>(In))
- NonConstIdx.push_back(idx);
- else {
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
+ if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
+ Immediate |= (InC->getZExtValue() & 0x1) << idx;
HasConstElts = true;
+ } else {
+ NonConstIdx.push_back(idx);
}
if (SplatIdx < 0)
SplatIdx = idx;
@@ -8805,9 +9035,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
if (Cond.getOpcode() != ISD::SETCC)
Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
DAG.getConstant(1, dl, MVT::i8));
- return DAG.getSelect(dl, VT, Cond,
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+
+ // Perform the select in the scalar domain so we can use cmov.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
+ DAG.getAllOnesConstant(dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ Select = DAG.getBitcast(MVT::v32i1, Select);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
+ SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
+ DAG.getAllOnesConstant(dl, ImmVT),
+ DAG.getConstant(0, dl, ImmVT));
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ Select = DAG.getBitcast(VecVT, Select);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
+ DAG.getIntPtrConstant(0, dl));
+ }
}
// insert elements one by one
@@ -8907,8 +9152,8 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
if (!CanFold)
break;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
- unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
+ unsigned I1 = Op1.getConstantOperandVal(1);
if (i * 2 < NumElts) {
if (V0.isUndef()) {
@@ -9056,11 +9301,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
- !isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
if (I0 != i)
return false;
@@ -9445,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return SDValue();
}
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
@@ -9452,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
@@ -9515,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
- return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+ if (!IsShift)
+ return Res;
+
+ // Immediately lower the shift to ensure the constant build vector doesn't
+ // get converted to a constant pool before the shift is lowered.
+ return LowerShift(Res, Subtarget, DAG);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
@@ -9571,9 +9826,11 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
- return extractSubVector(
- createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
- DAG, DL, SizeInBits);
+ SDValue NewSrcVec =
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
+ if (NewSrcVec)
+ return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
+ return SDValue();
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
@@ -9869,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
@@ -9929,7 +10186,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
- InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
+ InsIndex = DAG.getVectorIdxConstant(i, dl);
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
@@ -10929,6 +11186,71 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
+/// followed by unpack 256-bit.
+static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SmallVector<int, 32> Unpckl, Unpckh;
+ createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
+ createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
+
+ unsigned UnpackOpcode;
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ UnpackOpcode = X86ISD::UNPCKL;
+ else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ UnpackOpcode = X86ISD::UNPCKH;
+ else
+ return SDValue();
+
+ // This is a "natural" unpack operation (rather than the 128-bit sectored
+ // operation implemented by AVX). We need to rearrange 64-bit chunks of the
+ // input in order to use the x86 instruction.
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
+ DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
+}
+
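
As an aside for readers of this hunk: the following standalone C++ sketch (not
part of the patch; it assumes createSplat2ShuffleMask produces the
{0,0,1,1,...} mask for the Lo case and its upper-half analogue for Hi)
simulates the 64-bit chunk pre-permute followed by AVX's per-128-bit-lane
UNPCKL on a v8i32 input, showing that the pair realises the "natural" 256-bit
unpack-low pattern:

#include <cstdio>

int main() {
  // Scalar model of the trick above for a v8i32 single-input shuffle that
  // wants the natural 256-bit unpack-low mask {0,0,1,1,2,2,3,3} (assumed
  // shape of createSplat2ShuffleMask for the Lo case).
  int Src[8] = {0, 1, 2, 3, 4, 5, 6, 7};

  // Step 1: vpermpd-style rearrangement of the four 64-bit chunks as 0,2,1,3.
  int ChunkOrder[4] = {0, 2, 1, 3};
  int Permuted[8];
  for (int c = 0; c != 4; ++c) {
    Permuted[2 * c] = Src[2 * ChunkOrder[c]];
    Permuted[2 * c + 1] = Src[2 * ChunkOrder[c] + 1];
  }

  // Step 2: AVX UNPCKL works independently on each 128-bit lane (4 x i32),
  // interleaving the low two elements of both sources; here both sources are
  // the permuted vector.
  int Result[8];
  for (int Lane = 0; Lane != 2; ++Lane)
    for (int i = 0; i != 2; ++i) {
      Result[Lane * 4 + 2 * i] = Permuted[Lane * 4 + i];
      Result[Lane * 4 + 2 * i + 1] = Permuted[Lane * 4 + i];
    }

  for (int i = 0; i != 8; ++i)
    std::printf("%d ", Result[i]); // prints: 0 0 1 1 2 2 3 3
  std::printf("\n");
  return 0;
}
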
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+// TODO: Merge with matchShuffleAsVPMOV.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ return false;
+
+ unsigned NumElts = Mask.size();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+ unsigned NumSrcElts = NumElts / Scale;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+ continue;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+ SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+ DstVT = MVT::getIntegerVT(EltSizeInBits);
+ if ((NumSrcElts * EltSizeInBits) >= 128) {
+ // ISD::TRUNCATE
+ DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+ } else {
+ // X86ISD::VTRUNC
+ DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+ }
+ return true;
+ }
+
+ return false;
+}
+
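
A minimal sketch of the mask test above, using plain containers instead of
ArrayRef/APInt and a made-up helper name (matchTruncScale); it only models the
sequential-index and zeroable checks, not the 512-bit/VLX/BWI legality gating:

#include <cstdio>
#include <vector>

// Return the scale (2, 4 or 8) if the mask takes every Scale-th element into
// the low lanes and the remaining elements are known zero, or 0 otherwise.
// -1 in Mask means "undef".
static int matchTruncScale(const std::vector<int> &Mask,
                           const std::vector<bool> &Zeroable) {
  int NumElts = (int)Mask.size();
  for (int Scale = 2; Scale <= 8; Scale *= 2) {
    int NumSrcElts = NumElts / Scale;
    bool OK = NumSrcElts != 0;
    // Low elements must read 0, Scale, 2*Scale, ... (undef is fine).
    for (int i = 0; i != NumSrcElts && OK; ++i)
      OK = Mask[i] < 0 || Mask[i] == i * Scale;
    // Upper elements must all be zeroable.
    for (int i = NumSrcElts; i != NumElts && OK; ++i)
      OK = Zeroable[i];
    if (OK)
      return Scale;
  }
  return 0;
}

int main() {
  // v8i16 shuffle keeping elements 0,2,4,6 in the low half, top half zero.
  std::vector<int> Mask = {0, 2, 4, 6, -1, -1, -1, -1};
  std::vector<bool> Zeroable = {false, false, false, false,
                                true,  true,  true,  true};
  std::printf("scale = %d\n", matchTruncScale(Mask, Zeroable)); // scale = 2
  return 0;
}
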
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
@@ -11022,22 +11344,93 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
+/// Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
+/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
+/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
+/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
+ // The modulus for the shuffle vector entries is based on whether this is
+ // a single input or not.
+ int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+ assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+ "We should only be called with masks with a power-of-2 size!");
+
+ uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+ // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+ // and 2^3 simultaneously. This is because we may have ambiguity with
+ // partially undef inputs.
+ bool ViableForN[3] = {true, true, true};
+
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+ // want.
+ if (Mask[i] < 0)
+ continue;
+
+ bool IsAnyViable = false;
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j]) {
+ uint64_t N = j + 1;
+
+ // The shuffle mask must be equal to (i * 2^N) % M.
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ IsAnyViable = true;
+ else
+ ViableForN[j] = false;
+ }
+ // Early exit if we exhaust the possible powers of two.
+ if (!IsAnyViable)
+ break;
+ }
+
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j])
+ return j + 1;
+
+ // Return 0 as there is no viable power of two.
+ return 0;
+}
+
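
The same test, reduced to a freestanding function over std::vector for
experimentation (the power-of-two assertion mirrors the one above); the example
mask is the two-input N = 1 case listed in the comment:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns N in [1,3] if every defined mask entry equals (i << N) modulo the
// mask modulus, i.e. the shuffle can be done by dropping even elements N
// times, and 0 if no such N exists.
static int canLowerByDroppingEvenElements(const std::vector<int> &Mask,
                                          bool IsSingleInput) {
  int ShuffleModulus = (int)Mask.size() * (IsSingleInput ? 1 : 2);
  assert((ShuffleModulus & (ShuffleModulus - 1)) == 0 &&
         "Mask size must be a power of two");
  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  bool ViableForN[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match any pattern
    bool AnyViable = false;
    for (unsigned j = 0; j != 3; ++j) {
      if (!ViableForN[j])
        continue;
      uint64_t N = j + 1;
      if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
        AnyViable = true;
      else
        ViableForN[j] = false;
    }
    if (!AnyViable)
      break;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (ViableForN[j])
      return j + 1;
  return 0;
}

int main() {
  // Two-input v16i8 mask taking every other element: N = 1.
  std::vector<int> Mask = {0,  2,  4,  6,  8,  10, 12, 14,
                           16, 18, 20, 22, 24, 26, 28, 30};
  std::printf("N = %d\n", canLowerByDroppingEvenElements(Mask, false)); // N = 1
  return 0;
}
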
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
+// Checks for compaction shuffle masks if MaxStages > 1.
+// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
- MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
- MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
+ assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
+ "Illegal maximum compaction");
- auto MatchPACK = [&](SDValue N1, SDValue N2) {
+ auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
+ unsigned NumSrcBits = PackVT.getScalarSizeInBits();
+ unsigned NumPackedBits = NumSrcBits - BitSize;
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
- APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
+ if (Subtarget.hasSSE41() || BitSize == 8) {
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
@@ -11047,8 +11440,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return true;
}
}
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
@@ -11058,19 +11451,25 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return false;
};
- // Try binary shuffle.
- SmallVector<int, 32> BinaryMask;
- createPackShuffleMask(VT, BinaryMask, false);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
- if (MatchPACK(V1, V2))
- return true;
+ // Attempt to match against wider and wider compaction patterns.
+ for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
+ MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
+ MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
- // Try unary shuffle.
- SmallVector<int, 32> UnaryMask;
- createPackShuffleMask(VT, UnaryMask, true);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
- if (MatchPACK(V1, V1))
- return true;
+ // Try binary shuffle.
+ SmallVector<int, 32> BinaryMask;
+ createPackShuffleMask(VT, BinaryMask, false, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
+ if (MatchPACK(V1, V2, PackVT))
+ return true;
+
+ // Try unary shuffle.
+ SmallVector<int, 32> UnaryMask;
+ createPackShuffleMask(VT, UnaryMask, true, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
+ if (MatchPACK(V1, V1, PackVT))
+ return true;
+ }
return false;
}
@@ -11080,12 +11479,44 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
- return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
- DAG.getBitcast(PackVT, V2));
+ unsigned SizeBits = VT.getSizeInBits();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned MaxStages = Log2_32(64 / EltBits);
+ if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget, MaxStages))
+ return SDValue();
- return SDValue();
+ unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
+ unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
+
+ // Don't lower multi-stage packs on AVX512, truncation is better.
+ if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
+ return SDValue();
+
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ unsigned MaxPackBits = 16;
+ if (CurrentEltBits > 16 &&
+ (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
+ MaxPackBits = 32;
+
+ // Repeatedly pack down to the target size.
+ SDValue Res;
+ for (unsigned i = 0; i != NumStages; ++i) {
+ unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
+ unsigned NumSrcElts = SizeBits / SrcEltBits;
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
+ Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
+ DAG.getBitcast(SrcVT, V2));
+ V1 = V2 = Res;
+ CurrentEltBits /= 2;
+ }
+ assert(Res && Res.getValueType() == VT &&
+ "Failed to lower compaction shuffle");
+ return Res;
}
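
The stage bookkeeping in the loop above boils down to a few integer operations.
A small standalone sketch (hypothetical values, no DAG types) prints the pack
sequence chosen when compacting a 128-bit vector from 32-bit down to 8-bit
elements with PACKUSDW available:

#include <algorithm>
#include <cstdio>

int main() {
  unsigned EltBits = 8;          // final element width
  unsigned CurrentEltBits = 32;  // element width reported by the PACK matcher
  unsigned MaxPackBits = 32;     // 16 without PACKSS/PACKUSDW for dwords
  unsigned SizeBits = 128;       // vector width

  // Mirrors the repeated-pack loop: each stage halves the element width.
  while (CurrentEltBits != EltBits) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    unsigned NumSrcElts = SizeBits / SrcEltBits;
    std::printf("pack v%ui%u -> v%ui%u\n", NumSrcElts, SrcEltBits,
                NumSrcElts * 2, SrcEltBits / 2);
    CurrentEltBits /= 2;
  }
  // Prints: pack v4i32 -> v8i16, then pack v8i16 -> v16i8.
  return 0;
}
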
/// Try to emit a bitmask instruction for a shuffle.
@@ -11109,8 +11540,9 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
- AllOnes = DAG.getConstantFP(
- APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ APFloat AllOnesValue = APFloat::getAllOnesValue(
+ SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
+ AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
@@ -11312,6 +11744,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
+ // If we have VPTERNLOG, we can use that as a bit blend.
+ if (Subtarget.hasVLX())
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
@@ -11622,10 +12060,101 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend(
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// Try to lower a vector shuffle as a rotation.
+/// Try to lower a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub-group.
+/// Returns an ISD::ROTL element rotation amount or -1 if no match was found.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+ int NumElts = Mask.size();
+ assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+ int RotateAmt = -1;
+ for (int i = 0; i != NumElts; i += NumSubElts) {
+ for (int j = 0; j != NumSubElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ if (!isInRange(M, i, i + NumSubElts))
+ return -1;
+ int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+ if (0 <= RotateAmt && Offset != RotateAmt)
+ return -1;
+ RotateAmt = Offset;
+ }
+ }
+ return RotateAmt;
+}
+
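
To make the offset arithmetic concrete, here is a self-contained model of the
per-group check (plain std::vector, hypothetical helper name), applied to a
byte mask that rotates each 32-bit group left by three byte positions:

#include <cstdio>
#include <vector>

// Every group of NumSubElts mask entries must read from its own group, and
// every defined entry must imply the same left-rotation amount (in elements).
// Returns that amount, or -1 if the mask is not a per-group rotation.
static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue; // undef
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses the group boundary
      // NumSubElts - (M - (i + j)) is always in (0, 2*NumSubElts), so the
      // remainder is the left-rotation amount in elements.
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // inconsistent rotation amount
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // v16i8 mask that, within each 4-byte group, reads bytes 1,2,3,0: on a
  // little-endian target this is a 32-bit ROTL by 3 elements (24 bits).
  std::vector<int> Mask = {1, 2,  3,  0, 5,  6,  7,  4,
                           9, 10, 11, 8, 13, 14, 15, 12};
  int Elts = matchBitRotate(Mask, /*NumSubElts=*/4);
  std::printf("rotate by %d elements (%d bits)\n", Elts, Elts * 8);
  return 0;
}
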
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+ const X86Subtarget &Subtarget,
+ ArrayRef<int> Mask) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+ int MaxSubElts = 64 / EltSizeInBits;
+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+ if (RotateAmt < 0)
+ continue;
+
+ int NumElts = Mask.size();
+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+ return RotateAmt * EltSizeInBits;
+ }
+
+ return -1;
+}
+
+/// Lower shuffle using X86ISD::VROTLI rotations.
+static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Only XOP + AVX512 targets have bit rotation instructions.
+ // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
+ bool IsLegal =
+ (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
+ if (!IsLegal && Subtarget.hasSSE3())
+ return SDValue();
+
+ MVT RotateVT;
+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+ Subtarget, Mask);
+ if (RotateAmt < 0)
+ return SDValue();
+
+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
+  // widen to vXi16 or more then existing lowering should be better.
+ if (!IsLegal) {
+ if ((RotateAmt % 16) == 0)
+ return SDValue();
+ // TODO: Use getTargetVShiftByConstNode.
+ unsigned ShlAmt = RotateAmt;
+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+ V1 = DAG.getBitcast(RotateVT, V1);
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
+ return DAG.getBitcast(VT, Rot);
+ }
+
+ SDValue Rot =
+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, Rot);
+}
+
+/// Try to match a vector shuffle as an element rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
+static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
@@ -11712,7 +12241,7 @@ static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ if (isAnyZero(Mask))
return -1;
// PALIGNR works on 128-bit lanes.
@@ -11720,7 +12249,7 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -11788,7 +12317,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11800,7 +12329,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -12566,13 +13095,13 @@ static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
- EVT EltVT = VT.getVectorElementType();
- EVT V0VT = V0.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+ MVT V0VT = V0.getSimpleValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
- EVT V0EltVT = V0VT.getVectorElementType();
+ MVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
@@ -12636,7 +13165,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
- EVT VT = N0.getValueType();
+ MVT VT = N0.getSimpleValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
@@ -12649,9 +13178,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
return SDValue();
SDValue WideVec = N0.getOperand(0);
- EVT WideVT = WideVec.getValueType();
- if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
- !isa<ConstantSDNode>(N1.getOperand(1)))
+ MVT WideVT = WideVec.getSimpleValueType();
+ if (!WideVT.is256BitVector())
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
@@ -12699,7 +13227,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
- unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
@@ -12707,15 +13234,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
- int BroadcastIdx = -1;
- for (int i = 0; i != (int)NumElts; ++i) {
- SmallVector<int, 8> BroadcastMask(NumElts, i);
- if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
- BroadcastIdx = i;
- break;
- }
- }
-
+ int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
@@ -12724,6 +13243,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ // TODO: Combine this logic with findEltLoadSrc() used by
+ // EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
@@ -12739,14 +13260,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
BitOffset %= OpBitWidth;
continue;
}
+ case ISD::EXTRACT_SUBVECTOR: {
+ // The extraction index adds to the existing offset.
+ unsigned EltBitWidth = V.getScalarValueSizeInBits();
+ unsigned Idx = V.getConstantOperandVal(1);
+ unsigned BeginOffset = Idx * EltBitWidth;
+ BitOffset += BeginOffset;
+ V = V.getOperand(0);
+ continue;
+ }
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
- auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
- if (!ConstantIdx)
- break;
-
int EltBitWidth = VOuter.getScalarValueSizeInBits();
- int Idx = (int)ConstantIdx->getZExtValue();
+ int Idx = (int)V.getConstantOperandVal(2);
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
@@ -12777,8 +13303,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
- MVT BroadcastVT = VT;
-
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
@@ -12788,23 +13312,34 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
- BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
- Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
- ? X86ISD::MOVDDUP
- : Opcode;
- }
+ } else if (ISD::isNormalLoad(V.getNode()) &&
+ cast<LoadSDNode>(V)->isSimple()) {
+ // We do not check for one-use of the vector load because a broadcast load
+ // is expected to be a win for code size, register pressure, and possibly
+ // uops even if the original vector load is not eliminated.
- // If we are broadcasting a load that is only used by the shuffle
- // then we can reduce the vector load to the broadcasted scalar load.
+ // Reduce the vector load and shuffle to a broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
- EVT SVT = BroadcastVT.getScalarType();
+ MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+
+ // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
+ // than MOVDDUP.
+ // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), NewAddr};
+ V = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+ return DAG.getBitcast(VT, V);
+ }
+ assert(SVT == MVT::f64 && "Unexpected VT!");
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
@@ -12839,38 +13374,26 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
- // Bitcast back to the same scalar type as BroadcastVT.
- if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
- assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
- MVT ExtVT;
- if (V.getValueType().isVector()) {
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
- } else {
- ExtVT = BroadcastVT.getScalarType();
- }
- V = DAG.getBitcast(ExtVT, V);
- }
-
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
- V = DAG.getBitcast(MVT::f64, V);
- unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
- BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+ // If this is a scalar, do the broadcast on this type and bitcast.
+ if (!V.getValueType().isVector()) {
+ assert(V.getScalarValueSizeInBits() == NumEltBits &&
+ "Unexpected scalar size");
+ MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (V.getValueSizeInBits() > 128) {
- MVT ExtVT = V.getSimpleValueType().getScalarType();
- ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128)
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
- V = DAG.getBitcast(ExtVT, V);
- }
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+ // Otherwise cast V to a vector with the same element type as VT, but
+ // possibly narrower than VT. Then perform the broadcast.
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
+ return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -13259,7 +13782,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -13293,8 +13816,7 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
- int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
-
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
@@ -13548,7 +14070,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -14186,6 +14708,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
@@ -14262,6 +14789,29 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+ // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
+ // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
+ // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
+ !Subtarget.hasVLX()) {
+ SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
+ DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
+ DWordClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
+ DWordClearMask);
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
+ if (NumEvenDrops == 2) {
+ Result = DAG.getBitcast(MVT::v4i32, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
+ }
+ return Result;
+ }
+
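
As a sanity check on the AND+PACKUSDW approach for the single-drop case, a
scalar model (no intrinsics; it assumes the little-endian word order within
each dword and the SSE operand order of all words from the first source
followed by all words from the second):

#include <cstdint>
#include <cstdio>

// Scalar model of PACKUSDW on one 128-bit register pair: the result is the
// four unsigned-saturated words of A followed by the four of B.
static void packusdw(const uint32_t A[4], const uint32_t B[4],
                     uint16_t Out[8]) {
  auto Sat = [](uint32_t V) -> uint16_t {
    return V > 0xFFFF ? 0xFFFF : (uint16_t)V;
  };
  for (int i = 0; i != 4; ++i) {
    Out[i] = Sat(A[i]);
    Out[i + 4] = Sat(B[i]);
  }
}

int main() {
  // v8i16 inputs; dword i of the v4i32 view holds words 2*i (lo) and 2*i+1.
  uint16_t V1[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  uint16_t V2[8] = {8, 9, 10, 11, 12, 13, 14, 15};

  // AND with the DWordClearMask (0x0000FFFF per dword) keeps only the even
  // words, so the saturation in PACKUSDW never changes a value.
  uint32_t A[4], B[4];
  for (int i = 0; i != 4; ++i) {
    A[i] = (uint32_t)V1[2 * i]; // high word cleared
    B[i] = (uint32_t)V2[2 * i];
  }

  uint16_t Res[8];
  packusdw(A, B, Res);
  for (int i = 0; i != 8; ++i)
    std::printf("%d ", Res[i]); // prints: 0 2 4 6 8 10 12 14
  std::printf("\n");
  return 0;
}
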
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
@@ -14281,72 +14831,6 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG);
}
-/// Check whether a compaction lowering can be done by dropping even
-/// elements and compute how many times even elements must be dropped.
-///
-/// This handles shuffles which take every Nth element where N is a power of
-/// two. Example shuffle masks:
-///
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
-/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
-/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
-/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
-///
-/// Any of these lanes can of course be undef.
-///
-/// This routine only supports N <= 3.
-/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
-/// for larger N.
-///
-/// \returns N above, or the number of times even elements must be dropped if
-/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
- bool IsSingleInput) {
- // The modulus for the shuffle vector entries is based on whether this is
- // a single input or not.
- int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
- assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
- "We should only be called with masks with a power-of-2 size!");
-
- uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
-
- // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
- // and 2^3 simultaneously. This is because we may have ambiguity with
- // partially undef inputs.
- bool ViableForN[3] = {true, true, true};
-
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- // Ignore undef lanes, we'll optimistically collapse them to the pattern we
- // want.
- if (Mask[i] < 0)
- continue;
-
- bool IsAnyViable = false;
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j]) {
- uint64_t N = j + 1;
-
- // The shuffle mask must be equal to (i * 2^N) % M.
- if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
- IsAnyViable = true;
- else
- ViableForN[j] = false;
- }
- // Early exit if we exhaust the possible powers of two.
- if (!IsAnyViable)
- break;
- }
-
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j])
- return j + 1;
-
- // Return 0 as there is no viable power of two.
- return 0;
-}
-
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -14410,6 +14894,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
@@ -14524,6 +15013,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+ // Check for compaction patterns.
+ bool IsSingleInput = V2.isUndef();
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
@@ -14534,10 +15027,13 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
+ // If the mask is a binary compaction, we can more efficiently perform this
+ // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+ //
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
- if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
@@ -14595,8 +15091,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We special case these as they can be particularly efficiently handled with
// the PACKUSB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
- bool IsSingleInput = V2.isUndef();
- if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+ if (NumEvenDrops) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
@@ -14604,23 +15099,23 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
- SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
- for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
- ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
- SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+ SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
+ for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
+ WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
+ SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
+ WordClearMask);
if (!IsSingleInput)
- V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
+ WordClearMask);
// Now pack things back together.
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
- SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
-
return Result;
}
@@ -14725,37 +15220,13 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
- MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
- // Rather than splitting build-vectors, just build two narrower build
- // vectors. This helps shuffling with splats and zeros.
+ // Use splitVector/extractSubVector so that split build-vectors just build two
+ // narrower build vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
- V = peekThroughBitcasts(V);
-
- MVT OrigVT = V.getSimpleValueType();
- int OrigNumElements = OrigVT.getVectorNumElements();
- int OrigSplitNumElements = OrigNumElements / 2;
- MVT OrigScalarVT = OrigVT.getVectorElementType();
- MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
-
SDValue LoV, HiV;
-
- auto *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV) {
- LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(0, DL));
- HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(OrigSplitNumElements, DL));
- } else {
-
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (int i = 0; i < OrigSplitNumElements; ++i) {
- LoOps.push_back(BV->getOperand(i));
- HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
- }
- LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
- HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
- }
+ std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
@@ -15963,7 +16434,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
+ narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
@@ -15984,7 +16455,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16085,13 +16556,14 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
- SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
- if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
-
- if (Subtarget.hasAVX2())
+ }
+ if (Subtarget.hasAVX2()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
-
+ }
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
@@ -16190,7 +16662,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16210,9 +16682,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
- // If the shuffle patterns aren't repeated but it is a single input, directly
- // generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+
+ // If the shuffle patterns aren't repeated but it's a single input, directly
+ // generate a cross-lane VPERMD instruction.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
@@ -16294,6 +16771,16 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16379,7 +16866,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -16387,6 +16874,12 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16396,6 +16889,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
@@ -16518,13 +17016,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
- SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
+ SmallVector<int, 4> Widened128Mask;
+ if (!canWidenShuffleElements(Mask, Widened128Mask))
return SDValue();
+ assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
// Try to use an insert into a zero vector.
- if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
- (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -16536,37 +17035,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 0, 1, 2, 3});
- if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 8, 9, 10, 11})) {
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
+ if (OnlyUsesV1 ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
+ SDValue SubVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
- assert(WidenedMask.size() == 4);
-
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
- if (WidenedMask[i] < 4) {
- if (WidenedMask[i] != i) {
+ if (Widened128Mask[i] < 4) {
+ if (Widened128Mask[i] != i) {
IsInsert = false;
break;
}
} else {
      // Make sure we only have a single V2 index and it's the lowest 128-bits.
- if (V2Index >= 0 || WidenedMask[i] != 4) {
+ if (V2Index >= 0 || Widened128Mask[i] != 4) {
IsInsert = false;
break;
}
@@ -16580,16 +17076,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
+  // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
+ // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
+ // possible we at least ensure the lanes stay sequential to help later
+ // combines.
+ SmallVector<int, 2> Widened256Mask;
+ if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
+ Widened128Mask.clear();
+ narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
+ }
+
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
  // Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
- SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+ SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
@@ -16598,7 +17104,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
- PermMask |= (WidenedMask[i] % 4) << (i * 2);
+ PermMask |= (Widened128Mask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
@@ -16696,6 +17202,12 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
if (V2.isUndef() &&
@@ -16728,7 +17240,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
+ narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
@@ -16752,7 +17264,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16814,7 +17326,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16833,6 +17345,13 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
@@ -16841,6 +17360,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
+
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
@@ -16865,6 +17385,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16876,18 +17401,23 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputShuffle(
- DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
+ RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
@@ -16933,6 +17463,17 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Lower as AND if possible.
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
@@ -16995,6 +17536,18 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
+ // Try using bit ops for masking and blending before falling back to
+ // splitting.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
@@ -17477,6 +18030,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
+ // Expand v32i16/v64i8 without BWI.
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return SDValue();
+
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
@@ -17532,14 +18089,24 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
SDLoc dl(Op);
- if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+ if (!Vec.getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
- SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
+ // we're going to zero extend the register or fold the store.
+ if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
+ !MayFoldIntoStore(Op))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
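The index-0 special case above trades PEXTRB for a plain move when the extracted byte is not zero-extended or folded into a store. A rough intrinsics-level picture of the two forms (illustration only, not the lowering itself):

#include <immintrin.h>

// Both return byte 0 of a 128-bit vector. The second form is just a
// 32-bit MOVD plus an implicit truncate, which is what the lowering
// prefers for index 0 unless PEXTRB could fold a zero-extend or store.
unsigned char extractByte0_pextrb(__m128i V) {
  return (unsigned char)_mm_extract_epi8(V, 0); // SSE4.1 PEXTRB
}
unsigned char extractByte0_movd(__m128i V) {
  return (unsigned char)_mm_cvtsi128_si32(V);   // SSE2 MOVD
}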
@@ -17552,22 +18119,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
- if ((User->getOpcode() != ISD::STORE ||
- isNullConstant(Op.getOperand(1))) &&
+ if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
- Op.getOperand(1));
+ DAG.getBitcast(MVT::v4i32, Vec), Idx);
return DAG.getBitcast(MVT::f32, Extract);
}
- if (VT == MVT::i32 || VT == MVT::i64) {
- // ExtractPS/pextrq works with constant index.
- if (isa<ConstantSDNode>(Op.getOperand(1)))
+ if (VT == MVT::i32 || VT == MVT::i64)
return Op;
- }
return SDValue();
}
@@ -17580,6 +18142,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -17587,7 +18150,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
// variable index can't be handled in mask registers,
// extend vector to VR512/128
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
@@ -17598,7 +18161,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -17627,11 +18190,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+  auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get this performance estimate.
@@ -17665,7 +18229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
@@ -17697,9 +18261,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- // Transform it so it match pextrw which produces a 32-bit result.
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -17789,9 +18351,7 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
-
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
- Op.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -17864,11 +18424,22 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
- if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
- (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
- EltVT == MVT::i64)) {
- N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
- return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
+ N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ return DAG.getBitcast(VT, N1);
+ }
}
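For the i8/i16-into-all-zeros insert handled just above, the zero-extend-to-i32 trick means a single scalar-to-vector move already supplies the zeroed upper lanes. A hedged intrinsics sketch of that shape (the helper name is made up for illustration):

#include <immintrin.h>

// Builds <X, 0, 0, ...> viewed as v8i16: zero-extend X to i32, then one
// MOVD fills the rest of the register with zeros, avoiding a PINSRW
// into a separately zeroed vector.
__m128i insertU16IntoZeroVector(unsigned short X) {
  return _mm_cvtsi32_si128((int)X); // integer promotion zero-extends X
}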
  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
@@ -17981,12 +18552,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
- SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ uint64_t IdxVal = Op.getConstantOperandVal(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -18045,7 +18612,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
+ CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
@@ -18554,25 +19121,47 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
-
- assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
- if (IsFSHR)
- std::swap(Op0, Op1);
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
+ if (VT == MVT::i8 || ExpandFunnel)
+ return SDValue();
// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
- if (VT == MVT::i16)
+ if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
+ unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+ return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+ }
- unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
- return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+ return Op;
}
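The identities in the comment above can be checked with a scalar model of the i8 case (bw = 8); this is only a sketch of the expansion, not the DAG code itself:

#include <cstdint>

// fshl(x,y,z) = (((x << 8) | y) << (z & 7)) >> 8   (keep the low 8 bits)
// fshr(x,y,z) = (((x << 8) | y) >> (z & 7))        (keep the low 8 bits)
uint8_t fshl8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = ((uint32_t)X << 8) | Y;
  return (uint8_t)((Concat << (Z & 7)) >> 8);
}
uint8_t fshr8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = ((uint32_t)X << 8) | Y;
  return (uint8_t)(Concat >> (Z & 7));
}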
// Try to use a packed vector operation to handle i64 on 32-bit targets when
@@ -18682,6 +19271,56 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Allow FP_TO_UINT.
+ SDValue CastToInt = CastToFP.getOperand(0);
+ MVT VT = CastToFP.getSimpleValueType();
+ if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+ return SDValue();
+
+ MVT IntVT = CastToInt.getSimpleValueType();
+ SDValue X = CastToInt.getOperand(0);
+ MVT SrcVT = X.getSimpleValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // See if we have 128-bit vector cast instructions for this type of cast.
+ // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
+ if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
+ IntVT != MVT::i32)
+ return SDValue();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned IntSize = IntVT.getSizeInBits();
+ unsigned VTSize = VT.getSizeInBits();
+ MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
+ MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
+
+ // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
+ unsigned ToIntOpcode =
+ SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
+ unsigned ToFPOpcode =
+ IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
+
+ // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+ //
+  // We are not defining the high elements (e.g. zeroing them) because
+ // that could nullify any performance advantage that we hoped to gain from
+ // this vector op hack. We do not expect any adverse effects (like denorm
+ // penalties) with cast ops.
+ SDLoc DL(CastToFP);
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
+ SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
+ SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
+
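The source-level shape this new helper targets is a scalar FP value truncated to i32 and converted straight back to FP; per the comments above, the vectorized form can select the packed cvttpd2dq/cvtdq2pd (or cvttps2dq/cvtdq2ps) instructions and stay in XMM registers. A small, hedged illustration of the pattern:

// fp_to_sint followed by sint_to_fp on the same scalar.
double truncateViaInt(double X) {
  return (double)(int)X;
}
// Without this lowering the round trip goes XMM -> GPR -> XMM
// (cvttsd2si + cvtsi2sd); with it, both conversions can stay packed.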
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
@@ -18739,15 +19378,15 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
for (int i = 0; i != 4; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
- {Op.getOperand(0), Src});
+ {Op.getOperand(0), Elt});
Chains[i] = SignCvts[i].getValue(1);
} else {
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
}
}
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
@@ -18784,6 +19423,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
+ if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+ return R;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: Since v2f64 is a legal type. We don't need to zero extend the
@@ -18832,21 +19474,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
- if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- unsigned Size = SrcVT.getSizeInBits()/8;
+ unsigned Size = SrcVT.getStoreSize();
+ Align Alignment(Size);
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
- int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- Chain = DAG.getStore(
- Chain, dl, ValueToStore, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+ Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -18854,58 +19498,40 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return Tmp.first;
}
-std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
- SDValue StackSlot,
- SelectionDAG &DAG) const {
+std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
+ EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
// Build the FILD
- SDLoc DL(Op);
SDVTList Tys;
- bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ bool useSSE = isScalarFPTypeInSSEReg(DstVT);
if (useSSE)
- Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+ Tys = DAG.getVTList(MVT::f80, MVT::Other);
else
- Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ Tys = DAG.getVTList(DstVT, MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits() / 8;
-
- FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *LoadMMO;
- if (FI) {
- int SSFI = FI->getIndex();
- LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, ByteSize, ByteSize);
- } else {
- LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
- StackSlot = StackSlot.getOperand(1);
- }
- SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue FILDOps[] = {Chain, Pointer};
SDValue Result =
- DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
- Tys, FILDOps, SrcVT, LoadMMO);
+ DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
+ Alignment, MachineMemOperand::MOLoad);
Chain = Result.getValue(1);
if (useSSE) {
- SDValue InFlag = Result.getValue(2);
-
- // FIXME: Currently the FST is glued to the FILD_FLAG. This
- // shouldn't be necessary except that RFP cannot be live across
- // multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits() / 8;
- int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
+ unsigned SSFISize = DstVT.getStoreSize();
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ SDValue FSTOps[] = {Chain, Result, StackSlot};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOStore, SSFISize, SSFISize);
+ MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
- Op.getValueType(), StoreMMO);
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
Result = DAG.getLoad(
- Op.getValueType(), DL, Chain, StackSlot,
+ DstVT, DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
Chain = Result.getValue(1);
}
@@ -18948,7 +19574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -18958,7 +19584,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
// Load the 64-bit value into an XMM register.
SDValue XR1 =
@@ -19163,13 +19789,13 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(),
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDValue VBias = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*Alignment*/ 8, MachineMemOperand::MOLoad);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
+ MachineMemOperand::MOLoad);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
@@ -19337,15 +19963,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
- SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
- DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MachinePointerInfo());
+ OffsetSlot, MPI.getWithOffset(4), 4);
std::pair<SDValue, SDValue> Tmp =
- BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -19361,21 +19990,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
- DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
- int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, 8, 8);
-
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
- SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- MVT::i64, MMO);
+ SDValue Fild =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
+ Align(8), MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
@@ -19388,6 +20013,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
+ Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
@@ -19399,7 +20025,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- /* Alignment = */ 4);
+ CPAlignment);
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
@@ -19462,7 +20088,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
- int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
@@ -19537,20 +20164,20 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
- SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
@@ -19590,14 +20217,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ExtendInVecOpc, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -19729,7 +20351,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
- // Requires SSE2 but AVX512 has fast vector truncate.
+ // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
if (!Subtarget.hasSSE2())
return SDValue();
@@ -19770,15 +20392,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
- SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ // Split lower/upper subvectors.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(In, DAG, DL);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
@@ -19804,7 +20425,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
- scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
@@ -19818,7 +20439,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
- EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -19865,17 +20486,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
- // the original type is v16i8. In that case we can't split the v16i8 so
- // first we pre-extend it to v16i16 which we can split to v8i16, then extend
- // to v8i32, truncate that to v8i1 and concat the two halves.
+ // the original type is v16i8. In that case we can't split the v16i8
+ // directly, so we need to shuffle high elements to low and use
+ // sign_extend_vector_inreg.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ SDValue Lo, Hi;
if (InVT == MVT::v16i8) {
- // First we need to sign extend up to 256-bits so we can split that.
- InVT = MVT::v16i16;
- In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
+ Hi = DAG.getVectorShuffle(
+ InVT, DL, In, In,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
+ } else {
+ assert(InVT == MVT::v16i16 && "Unexpected VT!");
+ Lo = extract128BitVector(In, 0, DAG, DL);
+ Hi = extract128BitVector(In, 8, DAG, DL);
}
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
@@ -19918,7 +20544,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
- assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
+ "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
@@ -19942,6 +20569,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
+ if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(VT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
    // word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
@@ -20174,6 +20806,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (!Subtarget.hasVLX()) {
+      // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
+ // legalizer and then widened again by vector op legalization.
+ if (!IsStrict)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
+ {Src, Zero, Zero, Zero});
+ Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Tmp});
+ SDValue Chain = Tmp.getValue(1);
+ Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp, Chain}, dl);
+ return Tmp;
+ }
+
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
@@ -20281,6 +20932,62 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
+SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // If the source is in an SSE register, the node is Legal.
+ if (isScalarFPTypeInSSEReg(SrcVT))
+ return Op;
+
+ return LRINT_LLRINTHelper(Op.getNode(), DAG);
+}
+
+SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT DstVT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue Chain = DAG.getEntryNode();
+
+ bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
+
+ // If we're converting from SSE, the stack slot needs to hold both types.
+ // Otherwise it only needs to hold the DstVT.
+ EVT OtherVT = UseSSE ? SrcVT : DstVT;
+ SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ if (UseSSE) {
+ assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Chain, StackPtr };
+
+ Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Src.getValue(1);
+ }
+
+ SDValue StoreOps[] = { Chain, Src, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
+ StoreOps, DstVT, MPI, /*Align*/ None,
+ MachineMemOperand::MOStore);
+
+ return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
+}
+
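When the source value lives in an x87 register (f80, or f32/f64 without SSE), the new helper lowers lrint/llrint through a stack temporary and FIST rather than a libcall. A hedged source-level example of code that would reach this path (assuming the usual x86 long double == f80):

#include <cmath>

// f80 is never an SSE scalar type, so this conversion is lowered via
// the FLD/FIST-through-stack-slot sequence built above.
long long roundWithCurrentMode(long double X) {
  return std::llrint(X);
}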
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20333,6 +21040,67 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return Tmp.first;
}
+static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
+ DAG.getConstant(0, dl, MVT::v8i16), Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
+ {Op.getOperand(0), Res});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
+ DAG.getConstantFP(0, dl, MVT::v4f32), Src,
+ DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(
+ X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
+ Chain = Res.getValue(1);
+ } else {
+ // FIXME: Should we use zeros for upper elements for non-strict?
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
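A rough intrinsics-level picture of these two scalar lowerings (F16C), assuming the same rounding immediate 4 (_MM_FROUND_CUR_DIRECTION) used above; this is an illustration, not the exact DAG sequence:

#include <immintrin.h>

float halfToFloat(unsigned short H) {
  __m128i V = _mm_cvtsi32_si128(H);      // half bits in lane 0
  return _mm_cvtss_f32(_mm_cvtph_ps(V)); // VCVTPH2PS, take lane 0
}

unsigned short floatToHalf(float F) {
  __m128i V = _mm_cvtps_ph(_mm_set_ss(F), _MM_FROUND_CUR_DIRECTION);
  return (unsigned short)_mm_cvtsi128_si32(V); // VCVTPS2PH, take lane 0
}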
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
@@ -20413,6 +21181,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// floor(X + copysign(nextafter(0.5, 0.0), X)).
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // N0 += copysign(nextafter(0.5, 0.0), N0)
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ bool Ignored;
+ APFloat Point5Pred = APFloat(0.5f);
+ Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+ Point5Pred.next(/*nextDown*/true);
+
+ SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+ DAG.getConstantFP(Point5Pred, dl, VT), N0);
+ N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+ // Truncate the result to remove fraction.
+ return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
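The nextafter(0.5, 0.0) bias in the comment matters because adding exactly 0.5 can itself round up under round-to-nearest-even: 0.49999997f + 0.5f lands exactly halfway between two floats and rounds to 1.0f, which would truncate to 1 instead of 0. A scalar model of the emulation (sketch only):

#include <cmath>

// Round half away from zero via trunc(X + copysign(nextafter(0.5, 0), X)).
float roundHalfAwayFromZero(float X) {
  float Bias = std::copysign(std::nextafter(0.5f, 0.0f), X);
  return std::trunc(X + Bias);
}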
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -20568,9 +21360,12 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
-/// style scalarized (associative) reduction patterns.
+/// style scalarized (associative) reduction patterns. Partial reductions
+/// are supported when the pointer SrcMask is non-null.
+/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
- SmallVectorImpl<SDValue> &SrcOps) {
+ SmallVectorImpl<SDValue> &SrcOps,
+ SmallVectorImpl<APInt> *SrcMask = nullptr) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
@@ -20598,8 +21393,8 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
return false;
// Quit if without a constant index.
- SDValue Idx = I->getOperand(1);
- if (!isa<ConstantSDNode>(Idx))
+ auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
+ if (!Idx)
return false;
SDValue Src = I->getOperand(0);
@@ -20615,61 +21410,167 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
+
// Quit if element already used.
- unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned CIdx = Idx->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
- // Quit if not all elements are used.
- for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
- E = SrcOpMap.end();
- I != E; ++I) {
- if (!I->second.isAllOnesValue())
- return false;
+ if (SrcMask) {
+ // Collect the source partial masks.
+ for (SDValue &SrcOp : SrcOps)
+ SrcMask->push_back(SrcOpMap[SrcOp]);
+ } else {
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
}
return true;
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+// Helper function for comparing all bits of a vector against zero.
+static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
+ const APInt &Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, X86::CondCode &X86CC) {
+ EVT VT = V.getValueType();
+ assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
+ "Element Mask vs Vector bitwidth mismatch");
+
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+ X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
+
+ auto MaskBits = [&](SDValue Src) {
+ if (Mask.isAllOnesValue())
+ return Src;
+ EVT SrcVT = Src.getValueType();
+ SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
+ return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
+ };
+
+ // For sub-128-bit vector, cast to (legal) integer and compare with zero.
+ if (VT.getSizeInBits() < 128) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
+ return SDValue();
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getBitcast(IntVT, MaskBits(V)),
+ DAG.getConstant(0, DL, IntVT));
+ }
+
+ // Quit if not splittable to 128/256-bit vector.
+ if (!isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
+
+ // Split down to 128/256-bit vector.
+ unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
+ while (VT.getSizeInBits() > TestSize) {
+ auto Split = DAG.SplitVector(V, DL);
+ VT = Split.first.getValueType();
+ V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+ }
+
+ bool UsePTEST = Subtarget.hasSSE41();
+ if (UsePTEST) {
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ V = DAG.getBitcast(TestVT, MaskBits(V));
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
+ }
+
+ // Without PTEST, a masked v2i64 or-reduction is not faster than
+ // scalarization.
+ if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
+ return SDValue();
+
+ V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
+ V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
+ DAG.getConstant(0xFFFF, DL, MVT::i32));
+}
+
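The two code paths above correspond roughly to the following intrinsics-level tests of a 128-bit value against zero (sketch only, not the emitted DAG):

#include <immintrin.h>

bool isAllZero_PTEST(__m128i V) {   // SSE4.1 path
  return _mm_testz_si128(V, V) != 0;                       // PTEST sets ZF
}

bool isAllZero_MOVMSK(__m128i V) {  // SSE2 fallback
  __m128i EqZero = _mm_cmpeq_epi8(V, _mm_setzero_si128()); // PCMPEQB
  return _mm_movemask_epi8(EqZero) == 0xFFFF;              // PMOVMSKB + CMP
}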
+// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
+// CMP(MOVMSK(PCMPEQB(X,0))).
+static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
- if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
- SmallVector<SDValue, 8> VecIns;
- if (!matchScalarReduction(Op, ISD::OR, VecIns))
- return SDValue();
+ // Check whether we're masking/truncating an OR-reduction result, in which
+ // case track the masked bits.
+ APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ switch (Op.getOpcode()) {
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+ Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
+ Op.getScalarValueSizeInBits());
+ Op = Src;
+ break;
+ }
+ case ISD::AND: {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Cst->getAPIntValue();
+ Op = Op.getOperand(0);
+ }
+ break;
+ }
+ }
- // Quit if not 128/256-bit vector.
- EVT VT = VecIns[0].getValueType();
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SmallVector<SDValue, 8> VecIns;
+ if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
+ EVT VT = VecIns[0].getValueType();
+ assert(llvm::all_of(VecIns,
+ [VT](SDValue V) { return VT == V.getValueType(); }) &&
+ "Reduction source vector mismatch");
+
+ // Quit if less than 128-bits or not splittable to 128/256-bit vector.
+ if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
- SDLoc DL(Op);
- MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ // If more than one full vector is evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
+ Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is
+ // only 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
+ }
- // Cast all vectors into TestVT for PTEST.
- for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
- VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+ X86::CondCode CCode;
+ if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
+ DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
- // If more than one full vector is evaluated, OR them first before PTEST.
- for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
- // Each iteration will OR 2 nodes and append the result until there is only
- // 1 node left, i.e. the final OR'd value of all vectors.
- SDValue LHS = VecIns[Slot];
- SDValue RHS = VecIns[Slot + 1];
- VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ISD::NodeType BinOp;
+ if (SDValue Match =
+ DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
+ X86::CondCode CCode;
+ if (SDValue V =
+ LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
}
- X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+ return SDValue();
}
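The reduction this matcher recognizes is the scalarized OR of every element of one or more vectors feeding an equality test against zero, now also allowing a masking AND or TRUNCATE on top. A hedged source-level example of code that tends to produce such a tree after vectorization/legalization:

#include <cstdint>
#include <cstring>

// "Is this 16-byte block all zero?" written as an OR-reduction of the
// two halves; this is the kind of tree that can now become a single
// PTEST (or the MOVMSK fallback sketched earlier).
bool isZeroBlock16(const uint8_t *P) {
  uint64_t Lo, Hi;
  std::memcpy(&Lo, P, 8);
  std::memcpy(&Hi, P + 8, 8);
  return (Lo | Hi) == 0;
}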
/// return true if \c Op has a use that doesn't just read flags.
@@ -20814,27 +21715,14 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
- unsigned X86CC, const SDLoc &dl,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue Chain, bool IsSignaling) {
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
- return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
+ return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint()) {
- if (Chain) {
- SDValue Res =
- DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
- dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
- return std::make_pair(Res, Res.getValue(1));
- }
- return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
- SDValue());
- }
-
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20884,40 +21772,28 @@ static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
+ Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
+ return Add.getValue(1);
+ }
+
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
+ Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
+ return Add.getValue(1);
+ }
+
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return std::make_pair(Sub.getValue(1), SDValue());
-}
-
-/// Convert a comparison if required by the subtarget.
-SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
- SelectionDAG &DAG) const {
- // If the subtarget does not support the FUCOMI instruction, floating-point
- // comparisons have to be converted.
- bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
- bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
- Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
-
- if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
- !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
- return Cmp;
-
- // The instruction selector will select an FUCOM instruction instead of
- // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
- // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
- SDLoc dl(Cmp);
- SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
- SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
- SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
- DAG.getConstant(8, dl, MVT::i8));
- SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
-
- // Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
- return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+ return Sub.getValue(1);
}
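The new 0-x == y folds rely on the identity that, modulo 2^n, 0 - x == y exactly when x + y == 0; emitting the ADD lets EFLAGS come straight from it and drops the separate NEG. A tiny source-level example of the shape involved:

// Equality against a negation: the lowering can test ZF of X + Y
// instead of materializing 0 - X first.
bool negEquals(unsigned X, unsigned Y) {
  return 0u - X == Y; // same as X + Y == 0 (modulo 2^32)
}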
/// Check if replacement of SQRT with RSQRT should be disabled.
@@ -21211,32 +22087,30 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
/// concatenate the result back.
-static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
+static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
- "Unsupported value type for operation");
+ assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
+ assert(Op.getOperand(0).getValueType().isInteger() &&
+ VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
- unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+ DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
+ DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -21369,8 +22243,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ // If we have a strict compare with a vXi1 result and the input is 128/256
+ // bits we can't use a masked compare unless we have VLX. If we use a wider
+ // compare like we do for non-strict, we might trigger spurious exceptions
+  // from the upper elements. Instead emit an AVX compare and convert to mask.
unsigned Opc;
- if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
+ (!IsStrict || Subtarget.hasVLX() ||
+ Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
@@ -21466,10 +22346,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
- // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
- // result type of SETCC. The bitcast is expected to be optimized away
- // during combining/isel.
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
+ // We emitted a compare with an XMM/YMM result. Finish converting to a
+ // mask register using a vptestm.
+ EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
+ Cmp = DAG.getBitcast(CastVT, Cmp);
+ Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
+ DAG.getConstant(0, dl, CastVT), ISD::SETNE);
+ } else {
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match
+ // the result type of SETCC. The bitcast is expected to be optimized
+ // away during combining/isel.
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ }
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
@@ -21563,7 +22452,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
+ return splitIntVSETCC(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8) {
+ assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
+ return splitIntVSETCC(Op, DAG);
+ }
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
@@ -21812,9 +22706,8 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG, SDValue &X86CC,
- SDValue &Chain,
- bool IsSignaling) const {
+ SelectionDAG &DAG,
+ SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21825,13 +22718,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return BT;
}
- // Try to use PTEST for a tree ORs equality compared with 0.
+  // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
- if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
- return PTEST;
- }
+ if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
+ if (SDValue CmpZ =
+ MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
+ return CmpZ;
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
@@ -21873,17 +22765,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
- bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
- X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
- if (CondCode == X86::COND_INVALID)
- return SDValue();
+ X86::CondCode CondCode =
+ TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
+ assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
- std::pair<SDValue, SDValue> Tmp =
- EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
- SDValue EFLAGS = Tmp.first;
- if (Chain)
- Chain = Tmp.second;
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
@@ -21920,18 +22806,32 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
- Op.getOpcode() == ISD::STRICT_FSETCCS);
- if (!EFLAGS)
- return SDValue();
+ if (Op0.getSimpleValueType().isInteger()) {
+ SDValue X86CC;
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
+ }
- SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ // Handle floating point.
+ X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
+ if (CondCode == X86::COND_INVALID)
+ return SDValue();
- if (IsStrict)
- return DAG.getMergeValues({Res, Chain}, dl);
+ SDValue EFLAGS;
+ if (IsStrict) {
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ EFLAGS =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ Chain = EFLAGS.getValue(1);
+ } else {
+ EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
+ }
- return Res;
+ SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21946,9 +22846,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -22024,7 +22923,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
- Opc == X86ISD::SAHF)
+ Opc == X86ISD::FCMP)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
@@ -22057,9 +22956,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
- if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget.hasSSE2() && VT == MVT::f64) ||
- (Subtarget.hasSSE1() && VT == MVT::f32)) &&
+ if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
@@ -22115,45 +23012,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
- // For v64i1 without 64-bit support we need to split and rejoin.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- assert(Subtarget.hasBWI() && "Expected BWI to be legal");
- SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
- SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
- SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
- SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
- SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
- SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
-
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
- SDValue Op1Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
- Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
- else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
- Op1Scalar = Op1.getOperand(0);
- SDValue Op2Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
- Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
- else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
- Op2Scalar = Op2.getOperand(0);
- if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
- Op1Scalar, Op2Scalar);
- if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, newSelect);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
- DAG.getIntPtrConstant(0, DL));
- }
- }
-
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
@@ -22175,12 +23038,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
+ SDValue CmpOp0 = Cmp.getOperand(0);
unsigned CondCode = Cond.getConstantOperandVal(0);
- if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+    // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
+    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
+    // handling here to keep the CMP with 0. The CMP should later be removed by
+    // optimizeCompareInst, which can reuse the flags from the BSR/TZCNT emitted
+    // for the cttz_zero_undef.
+ auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
+ return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
+ Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
+ };
+ if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
+ ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
+ (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
+ // Keep Cmp.
+ } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
- SDValue CmpOp0 = Cmp.getOperand(0);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
@@ -22188,31 +23067,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
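+        // NEG CmpOp0 sets CF exactly when CmpOp0 != 0, so SBB 0, 0 with that
+        // carry materializes -1 when CmpOp0 != 0 and 0 otherwise.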
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
}
- Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
- DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
- if (!isNullConstant(Op2))
- Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
- return Res;
+ return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
- SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// true if Op2 is XOR or OR operator and one of its operands
// is equal to Op1
@@ -22265,7 +23138,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
- !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -22311,7 +23184,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
- Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -22333,7 +23205,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
- // Blacklist CopyFromReg to avoid partial register stalls.
+ // Exclude CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
@@ -22570,14 +23442,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -22620,23 +23487,19 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
if (!Store->isSimple())
return SDValue();
- EVT StoreVT = StoredVal.getValueType();
- unsigned NumElems = StoreVT.getVectorNumElements();
- unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
- unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
-
SDLoc DL(Store);
- SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
- SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Value0, Value1;
+ std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
+ unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
- unsigned Alignment = Store->getAlignment();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
- Alignment, Store->getMemOperand()->getFlags());
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
- Store->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Store->getPointerInfo().getWithOffset(HalfOffset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
@@ -22659,7 +23522,6 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
- unsigned Alignment = Store->getAlignment();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
@@ -22670,7 +23532,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
- MinAlign(Alignment, Offset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
@@ -22699,7 +23561,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -22711,7 +23573,9 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
- if (StoreVT.is256BitVector()) {
+ if (StoreVT.is256BitVector() ||
+ ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+ !Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
@@ -22738,7 +23602,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
@@ -22773,7 +23637,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
@@ -22801,163 +23665,44 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Op.getOperand(1).hasOneUse());
}
-/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
-/// SETCC node has a single use.
-static bool isXor1OfSetCC(SDValue Op) {
- if (Op.getOpcode() != ISD::XOR)
- return false;
- if (isOneConstant(Op.getOperand(1)))
- return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
- Op.getOperand(0).hasOneUse();
- return false;
-}
-
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- SDValue CC;
- bool Inverted = false;
- if (Cond.getOpcode() == ISD::SETCC) {
- // Check for setcc([su]{add,sub,mul}o == 0).
- if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
- isNullConstant(Cond.getOperand(1)) &&
- Cond.getOperand(0).getResNo() == 1 &&
- (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
- Cond.getOperand(0).getOpcode() == ISD::UADDO ||
- Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
- Cond.getOperand(0).getOpcode() == ISD::USUBO ||
- Cond.getOperand(0).getOpcode() == ISD::SMULO ||
- Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
- Inverted = true;
- Cond = Cond.getOperand(0);
- } else {
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
- Cond = NewCond;
- }
- }
-#if 0
- // FIXME: LowerXALUO doesn't handle these!!
- else if (Cond.getOpcode() == X86ISD::ADD ||
- Cond.getOpcode() == X86ISD::SUB ||
- Cond.getOpcode() == X86ISD::SMUL ||
- Cond.getOpcode() == X86ISD::UMUL)
- Cond = LowerXALUO(Cond, DAG);
-#endif
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() != MVT::f128) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- // Look pass (and (setcc_carry (cmp ...)), 1).
- if (Cond.getOpcode() == ISD::AND &&
- Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
- isOneConstant(Cond.getOperand(1)))
- Cond = Cond.getOperand(0);
+ // Special case for
+ // setcc([su]{add,sub,mul}o == 0)
+ // setcc([su]{add,sub,mul}o != 1)
+ if (ISD::isOverflowIntrOpRes(LHS) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ (isNullConstant(RHS) || isOneConstant(RHS))) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
- // If condition flag is set by a X86ISD::CMP, then use it as the condition
- // setting operand in place of the X86ISD::SETCC.
- unsigned CondOpcode = Cond.getOpcode();
- if (CondOpcode == X86ISD::SETCC ||
- CondOpcode == X86ISD::SETCC_CARRY) {
- CC = Cond.getOperand(0);
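+      // "o == 0" and "o != 1" test that the overflow did *not* occur, so the
+      // branch condition has to be inverted.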
+ if ((CC == ISD::SETEQ) == isNullConstant(RHS))
+ X86Cond = X86::GetOppositeBranchCondition(X86Cond);
- SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
- // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
- if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
- Cond = Cmp;
- addTest = false;
- } else {
- switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
- default: break;
- case X86::COND_O:
- case X86::COND_B:
- // These can only come from an arithmetic instruction with overflow,
- // e.g. SADDO, UADDO.
- Cond = Cond.getOperand(1);
- addTest = false;
- break;
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- }
- CondOpcode = Cond.getOpcode();
- if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
- CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
- CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
- SDValue Value;
- X86::CondCode X86Cond;
- std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- if (Inverted)
- X86Cond = X86::GetOppositeBranchCondition(X86Cond);
+ if (LHS.getSimpleValueType().isInteger()) {
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
+ }
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- addTest = false;
- } else {
- unsigned CondOpc;
- if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
- SDValue Cmp = Cond.getOperand(0).getOperand(1);
- if (CondOpc == ISD::OR) {
- // Also, recognize the pattern generated by an FCMP_UNE. We can emit
- // two branches instead of an explicit OR instruction with a
- // separate test.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp)) {
- CC = Cond.getOperand(0).getOperand(0);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = Cond.getOperand(1).getOperand(0);
- Cond = Cmp;
- addTest = false;
- }
- } else { // ISD::AND
- // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
- // two branches instead of an explicit AND instruction with a
- // separate test. However, we only do this if this block doesn't
- // have a fall-through edge, because this requires an explicit
- // jmp when the condition is false.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp) &&
- Op.getNode()->hasOneUse()) {
- X86::CondCode CCode0 =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode0 = X86::GetOppositeBranchCondition(CCode0);
- CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
- SDNode *User = *Op.getNode()->use_begin();
- // Look for an unconditional branch following this conditional branch.
- // We need this because we need to reverse the successors in order
- // to implement FCMP_OEQ.
- if (User->getOpcode() == ISD::BR) {
- SDValue FalseBB = User->getOperand(1);
- SDNode *NewBR =
- DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
- assert(NewBR == User);
- (void)NewBR;
- Dest = FalseBB;
-
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
- Dest, CC, Cmp);
- X86::CondCode CCode1 =
- (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
- CCode1 = X86::GetOppositeBranchCondition(CCode1);
- CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
- }
- }
- }
- } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
- // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
- // It should be transformed during dag combiner except when the condition
- // is set by a arithmetics with overflow node.
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
- Cond = Cond.getOperand(0).getOperand(1);
- addTest = false;
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ if (CC == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
@@ -22976,59 +23721,65 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
(void)NewBR;
Dest = FalseBB;
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
+ CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ } else if (CC == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain =
+ DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ } else {
+ X86::CondCode X86Cond =
+ TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- if (addTest) {
- // Look pass the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ if (ISD::isOverflowIntrOpRes(Cond)) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- // We know the result of AND is compared against zero. Try to match
- // it to BT.
- if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue BTCC;
- if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
- CC = BTCC;
- Cond = BT;
- addTest = false;
- }
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- if (addTest) {
- X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
- }
- Cond = ConvertCmpIfNecessary(Cond, DAG);
- return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cond);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ EVT CondVT = Cond.getValueType();
+
+ // Add an AND with 1 if we don't already have one.
+ if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
+ Cond =
+ DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
+
+ SDValue LHS = Cond;
+ SDValue RHS = DAG.getConstant(0, dl, CondVT);
+
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
@@ -23041,9 +23792,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
+ bool EmitStackProbeCall = hasStackProbeSymbol(MF);
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack || EmitStackProbe;
+ SplitStack || EmitStackProbeCall;
SDLoc dl(Op);
// Get the inputs.
@@ -23067,12 +23818,22 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlign(TFI.getStackAlignment());
- Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Alignment && Alignment > StackAlign)
+ const Align StackAlign = TFI.getStackAlign();
+ if (hasInlineStackProbe(MF)) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ }
+ if (Alignment && *Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
@@ -23203,14 +23964,13 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
- if (ArgVT == MVT::f80) {
- llvm_unreachable("va_arg for f80 not yet implemented");
- } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
+ if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
- } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
- ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
- llvm_unreachable("Unhandled argument type in LowerVAARG");
+ assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
+ "Unhandled argument type in LowerVAARG");
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
}
if (ArgMode == 2) {
@@ -23227,11 +23987,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
- X86ISD::VAARG_64, dl,
- VTs, InstOps, MVT::i64,
- MachinePointerInfo(SV),
- /*Align=*/0,
- MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -23255,9 +24012,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
- return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
- DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
- false, false,
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
+ Align(8), /*isVolatile*/ false, false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
@@ -24004,8 +24760,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ // Some conditions require the operands to be swapped.
+ if (CC == ISD::SETLT || CC == ISD::SETLE)
+ std::swap(LHS, RHS);
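+    // e.g. "a < b" is handled as "b > a", so SETLT/SETLE reuse the
+    // SETGT/SETGE lowering below with the operands reversed.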
+
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
- SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
@@ -24021,18 +24780,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
+ case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
- case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
- SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
- break;
}
case ISD::SETGE: // CF = 0
+ case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
- case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
- SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
- break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
@@ -24481,6 +25236,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Clamp out of bounds shift amounts since they will otherwise be masked
    // to 8 bits, which may make it no longer out of bounds.
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
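+          // A splatted shift amount of zero is a no-op; return the unshifted
+          // source vector (operand 1 of the intrinsic).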
+ if (ShiftAmount == 0)
+ return Op.getOperand(1);
+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
@@ -24540,19 +25298,23 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ // Cast mask to an integer type.
+ Mask = DAG.getBitcast(MaskVT, Mask);
+
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
@@ -24577,7 +25339,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
@@ -24587,9 +25349,10 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24615,11 +25378,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return Res.getValue(1);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res;
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24778,13 +25542,11 @@ static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
- return SignedSat ?
- DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
@@ -24792,12 +25554,10 @@ static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
- return SignedSat ?
- DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
@@ -25147,7 +25907,7 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
-unsigned X86TargetLowering::getExceptionPointerRegister(
+Register X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
@@ -25155,7 +25915,7 @@ unsigned X86TargetLowering::getExceptionPointerRegister(
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
-unsigned X86TargetLowering::getExceptionSelectorRegister(
+Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
@@ -25179,7 +25939,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
- unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+ Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
@@ -25393,93 +26153,51 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2 Round to +inf
3 Round to -inf
- To perform the conversion, we do:
- (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+ To perform the conversion, we use a packed lookup table of the four 2-bit
+     values that we can index by FPSR[11:10]
+ 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
+
+ (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
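+
+    e.g. FPSR[11:10] = 0b10 (x87 "round up"):
+      (0x2d >> ((0x800) >> 9)) & 3 = (0x2d >> 4) & 3 = 2  (Round to +inf)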
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI =
- MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, 2, 2);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
- SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
- DAG.getVTList(MVT::Other),
- Ops, MVT::i16, MMO);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain, StackSlot};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+ Align(2), MachineMemOperand::MOStore);
// Load FP Control Word from stack slot
- SDValue CWD =
- DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
+ Chain = CWD.getValue(1);
- // Transform as necessary
- SDValue CWD1 =
+ // Mask and turn the control bits into a shift for the lookup table.
+ SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x800, DL, MVT::i16)),
- DAG.getConstant(11, DL, MVT::i8));
- SDValue CWD2 =
- DAG.getNode(ISD::SRL, DL, MVT::i16,
- DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+ CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
+ SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
SDValue RetVal =
- DAG.getNode(ISD::AND, DL, MVT::i16,
- DAG.getNode(ISD::ADD, DL, MVT::i16,
- DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
- DAG.getConstant(1, DL, MVT::i16)),
- DAG.getConstant(3, DL, MVT::i16));
+ DAG.getNode(ISD::AND, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+ DAG.getConstant(3, DL, MVT::i32));
- return DAG.getNode((VT.getSizeInBits() < 16 ?
- ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
-}
+ RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
-// Split an unary integer op into 2 half sized ops.
-static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- unsigned NumElems = VT.getVectorNumElements();
- unsigned SizeInBits = VT.getSizeInBits();
- MVT EltVT = VT.getVectorElementType();
- SDValue Src = Op.getOperand(0);
- assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
- "Src and Op should have the same element type!");
-
- // Extract the Lo/Hi vectors
- SDLoc dl(Op);
- SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
- SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
-
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
- DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
-}
-
-// Decompose 256-bit ops into smaller 128-bit ops.
-static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
-}
-
-// Decompose 512-bit ops into smaller 256-bit ops.
-static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is512BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 512-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
+ return DAG.getMergeValues({RetVal, Chain}, DL);
}
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
@@ -25502,7 +26220,7 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
// Split vector, it's Lo and Hi parts will be handled in next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
- return LowerVectorIntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
@@ -25612,11 +26330,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -25682,64 +26400,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
-/// Break a 256-bit integer operation into two new 128-bit ones and then
-/// concatenate the result back.
-static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is256BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
-/// Break a 512-bit integer operation into two new 256-bit ones and then
-/// concatenate the result back.
-static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is512BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -25750,10 +26410,13 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
@@ -25798,10 +26461,13 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
@@ -25831,9 +26497,12 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
}
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
// Default to expand.
return SDValue();
}
@@ -25843,7 +26512,10 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
@@ -25887,7 +26559,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
@@ -26033,7 +26708,10 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
@@ -26122,41 +26800,9 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
- // For signed 512-bit vectors, split into 256-bit vectors to allow the
- // sign-extension to occur.
- if (VT == MVT::v64i8 && IsSigned)
- return split512IntArith(Op, DAG);
-
- // Signed AVX2 implementation - extend xmm subvectors to ymm.
- if (VT == MVT::v32i8 && IsSigned) {
- MVT ExVT = MVT::v16i16;
- SDValue ALo = extract128BitVector(A, 0, DAG, dl);
- SDValue BLo = extract128BitVector(B, 0, DAG, dl);
- SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
- SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
- ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
- BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
- AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
- BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
- Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- // Shuffle lowering should turn this into PACKUS+PERMQ
- Lo = DAG.getBitcast(VT, Lo);
- Hi = DAG.getBitcast(VT, Hi);
- return DAG.getVectorShuffle(VT, dl, Lo, Hi,
- { 0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30,
- 32, 34, 36, 38, 40, 42, 44, 46,
- 48, 50, 52, 54, 56, 58, 60, 62});
- }
-
- // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
- // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
- // shift the results and pack the half lane results back together.
+  // For vXi8 we will unpack the low and high half of each 128-bit lane to widen
+ // to a vXi16 type. Do the multiplies, shift the results and pack the half
+ // lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -26270,9 +26916,12 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
- MachinePointerInfo(), /* Alignment = */ 16);
+ MPI, /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
@@ -26413,7 +27062,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
- VT == MVT::v64i8) {
+ (Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -26859,8 +27508,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
- (VT == MVT::v16i8 || VT == MVT::v64i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
@@ -26923,12 +27572,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
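Several hunks in this patch swap a VSELECT for an explicit X86ISD::BLENDV because PBLENDVB only examines the sign bit of each selector byte. A per-byte model of that behaviour, written with the same operand order as the BLENDV node built above (sign bit set picks V0); illustrative sketch only:

#include <cstdint>

// One byte lane of PBLENDVB-style selection: only bit 7 of the selector
// matters, everything else in the selector byte is ignored.
uint8_t blendv_byte(uint8_t Sel, uint8_t V0, uint8_t V1) {
  return (Sel & 0x80) ? V0 : V1;
}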
@@ -27038,14 +27688,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just on
+ // the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
@@ -27096,7 +27747,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
return SDValue();
}
@@ -27114,28 +27768,21 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
- APInt UndefElts;
- SmallVector<APInt, 32> EltBits;
- int CstSplatIndex = -1;
- if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
- for (int i = 0; i != NumElts; ++i)
- if (!UndefElts[i]) {
- if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
- CstSplatIndex = i;
- continue;
- }
- CstSplatIndex = -1;
- break;
- }
+ APInt CstSplatValue;
+ bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
+
+ // Check for splat rotate by zero.
+ if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
+ return R;
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
- return DAG.getNode(Op, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ if (IsCstSplat) {
+ unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ return DAG.getNode(RotOpc, DL, VT, R,
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
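The new IsCstSplat path depends on two facts stated in the comments above: a splat rotate by zero is the identity, and AVX512/XOP rotates reduce the amount modulo the element size (hence the urem). A scalar sketch for one 32-bit element, under those assumptions:

#include <cstdint>

// Left-rotate of a single 32-bit element with implicit modulo semantics;
// Amt % 32 == 0 degenerates to the identity, which is why the lowering can
// simply return R for a splat rotate-by-zero.
uint32_t rotl32(uint32_t X, uint64_t Amt) {
  unsigned R = unsigned(Amt % 32);
  return R == 0 ? X : (X << R) | (X >> (32 - R));
}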
@@ -27149,14 +27796,14 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
+ if (IsCstSplat) {
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
@@ -27165,7 +27812,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
@@ -27173,7 +27820,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
- if (0 <= CstSplatIndex)
+ if (IsCstSplat)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
@@ -27189,12 +27836,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
@@ -27306,15 +27954,14 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
return false;
}
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
-// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return false;
return needsCmpXchgNb(MemType);
@@ -27333,7 +27980,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@@ -27399,7 +28046,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->use_empty())
return nullptr;
- auto Builder = IRBuilder<>(AI);
+ IRBuilder<> Builder(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
@@ -27441,7 +28088,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
- AI->getType()->getPrimitiveSizeInBits());
+ Align(AI->getType()->getPrimitiveSizeInBits()));
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -27636,18 +28283,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
- SDLoc dl(Op);
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT CastVT = DstVT.getHalfNumVectorElementsVT();
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- }
-
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
@@ -27831,11 +28466,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
@@ -27879,7 +28514,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
@@ -27916,12 +28551,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
- // lowering.
- if (VT == MVT::v8i64 || VT == MVT::v16i32) {
- assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
- return Lower512IntUnary(Op, DAG);
- }
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+ if (VT == MVT::v64i8 && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
@@ -27929,7 +28561,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
@@ -28073,28 +28705,54 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
- // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
- // FIXME: Use movlps with SSE1.
- // FIXME: Use fist with X87.
+ // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+ // is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- Subtarget.hasSSE2()) {
- SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Node->getOperand(2));
- SDVTList Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
- Ops, MVT::i64,
- Node->getMemOperand());
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ SDValue Chain;
+ if (Subtarget.hasSSE1()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SclToVec = DAG.getBitcast(StVT, SclToVec);
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ } else if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register using a stack temporary.
+ // This will put the whole integer into the significand.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain =
+ DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
+ MPI, /*Align*/ 0, MachineMemOperand::MOStore);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue LdOps[] = {Chain, StackPtr};
+ SDValue Value =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Value.getValue(1);
+
+ // Now use an FIST to do the atomic store.
+ SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
+ StoreOps, MVT::i64, Node->getMemOperand());
+ }
- // If this is a sequentially consistent store, also emit an appropriate
- // barrier.
- if (IsSeqCst)
- Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
- return Chain;
+ return Chain;
+ }
}
}
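The X87 fallback added above works because the 80-bit x87 format carries a 64-bit significand, so an i64 survives the FILD/FIST round trip bit-for-bit. A small host-side illustration of that property (assumes long double maps to the x87 80-bit type, as it does under the usual x86 psABIs; sketch only):

#include <cstdint>

// i64 -> f80 -> i64 is lossless because all 64 value bits fit in the x87
// significand; this is the property the FILD/FIST sequence relies on.
bool fild_fist_roundtrips(int64_t V) {
  long double F = static_cast<long double>(V); // models FILD from the slot
  return static_cast<int64_t>(F) == V;         // models FIST back to memory
}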
@@ -28123,9 +28781,8 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
@@ -28170,7 +28827,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)VectorType::get(ArgTy, 4);
+ : (Type *)FixedVectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -28267,17 +28924,15 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
- SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
return SDValue();
}
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
@@ -28295,18 +28950,17 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
@@ -28332,8 +28986,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
- SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
- PassThru);
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
@@ -28369,10 +29022,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewLoad.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
@@ -28430,7 +29083,6 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
@@ -28451,7 +29103,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
@@ -28460,12 +29112,12 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ SDValue NewGather = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
- return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
+ return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
@@ -28531,6 +29183,20 @@ SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
return Tmp.first;
}
+// Custom split CVTPS2PH with wide types.
+static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ SDValue RC = Op.getOperand(1);
+ Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
+ Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -28584,14 +29250,21 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::FP16_TO_FP:
+ case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16:
+ case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::LRINT:
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
@@ -28659,8 +29332,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
- case ISD::ADDRSPACECAST:
- return LowerADDRSPACECAST(Op, DAG);
+ case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
+ case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
}
}
@@ -28706,6 +29379,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
+ Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+ case X86ISD::STRICT_CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
+ {N->getOperand(0), Lo});
+ Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
+ {N->getOperand(0), Hi});
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
@@ -28775,7 +29477,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ABS: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
@@ -28788,15 +29489,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
- DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
- TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
- Results.push_back(Lo);
- Results.push_back(Hi);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
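The i64 ABS expansion in the hunk above is the branch-free identity abs(x) == (x + s) ^ s with s = x >> 63, carried out across 32-bit halves with UADDO/ADDCARRY before the halves are glued back together with BUILD_PAIR. A scalar sketch of the identity (unsigned arithmetic keeps the wrap-around well defined):

#include <cstdint>

// S is 0 for non-negative X and all-ones for negative X, so (X + S) ^ S
// yields X or -X respectively - the same computation the split Lo/Hi
// sequence above performs.
uint64_t abs64(int64_t X) {
  uint64_t U = uint64_t(X);
  uint64_t S = uint64_t(X >> 63); // arithmetic shift: sign splat
  return (U + S) ^ S;
}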
@@ -29148,6 +29847,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::LRINT:
+ case ISD::LLRINT: {
+ if (SDValue V = LRINT_LLRINTHelper(N, DAG))
+ Results.push_back(V);
+ return;
+ }
+
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
@@ -29185,14 +29891,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
- {N->getOperand(0), Src});
+ {N->getOperand(0), Elt});
else
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
};
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
@@ -29272,7 +29978,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V.getValue(1));
return;
}
- case ISD::FP_EXTEND: {
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
@@ -29394,15 +30101,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
- if (Subtarget.hasSSE2()) {
- // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
- // lower 64-bits.
- SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ if (Subtarget.hasSSE1()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+ // Then extract the lower 64-bits.
+ MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ if (Subtarget.hasSSE2()) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ // We use an alternative sequence for SSE1 that extracts as v2f32 and
+ // then casts to i64. This avoids a 128-bit stack temporary being
+ // created by type legalization if we were to cast v4f32->v2i64.
+ SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getBitcast(MVT::i64, Res);
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
@@ -29410,14 +30129,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
- // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
- SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
- SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
@@ -29427,11 +30144,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
- Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
- DAG.getVTList(MVT::Other), StoreOps,
- MVT::i64, MPI, 0 /*Align*/,
- MachineMemOperand::MOStore);
+ SDValue StoreOps[] = { Chain, Result, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
+ MPI, None /*Align*/, MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
@@ -29480,24 +30196,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
- SrcVT.isVector() && isTypeLegal(SrcVT)) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- Results.push_back(Res);
- return;
- }
-
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+ // FIXME: Use v4f32 for SSE1?
+ assert(Subtarget.hasSSE2() && "Requires SSE2");
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
- SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
+ N->getOperand(0));
+ Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
return;
}
@@ -29529,11 +30236,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
- Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Res = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
+ Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
- Results.push_back(Res.getValue(2));
+ Results.push_back(Res.getValue(1));
return;
}
return;
@@ -29552,7 +30259,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
@@ -29573,25 +30280,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ADDRSPACECAST: {
- SDValue Src = N->getOperand(0);
- EVT DstVT = N->getValueType(0);
- AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
- unsigned SrcAS = CastN->getSrcAddressSpace();
-
- assert(SrcAS != CastN->getDestAddressSpace() &&
- "addrspacecast must be between different address spaces");
-
- SDValue Res;
- if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
- Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i64)
- Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i32)
- Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
- else
- report_fatal_error("Unrecognized addrspacecast type legalization");
-
- Results.push_back(Res);
+ SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+ Results.push_back(V);
return;
}
}
@@ -29600,362 +30290,367 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
- case X86ISD::BSF: return "X86ISD::BSF";
- case X86ISD::BSR: return "X86ISD::BSR";
- case X86ISD::SHLD: return "X86ISD::SHLD";
- case X86ISD::SHRD: return "X86ISD::SHRD";
- case X86ISD::FAND: return "X86ISD::FAND";
- case X86ISD::FANDN: return "X86ISD::FANDN";
- case X86ISD::FOR: return "X86ISD::FOR";
- case X86ISD::FXOR: return "X86ISD::FXOR";
- case X86ISD::FILD: return "X86ISD::FILD";
- case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FIST: return "X86ISD::FIST";
- case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
- case X86ISD::FLD: return "X86ISD::FLD";
- case X86ISD::FST: return "X86ISD::FST";
- case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::BT: return "X86ISD::BT";
- case X86ISD::CMP: return "X86ISD::CMP";
- case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
- case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
- case X86ISD::COMI: return "X86ISD::COMI";
- case X86ISD::UCOMI: return "X86ISD::UCOMI";
- case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
- case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
- case X86ISD::SETCC: return "X86ISD::SETCC";
- case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
- case X86ISD::FSETCC: return "X86ISD::FSETCC";
- case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
- case X86ISD::CMOV: return "X86ISD::CMOV";
- case X86ISD::BRCOND: return "X86ISD::BRCOND";
- case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
- case X86ISD::IRET: return "X86ISD::IRET";
- case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
- case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
- case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
- case X86ISD::Wrapper: return "X86ISD::Wrapper";
- case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
- case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
- case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
- case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
- case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
- case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
- case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
- case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
- case X86ISD::PINSRB: return "X86ISD::PINSRB";
- case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
- case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::BLENDI: return "X86ISD::BLENDI";
- case X86ISD::BLENDV: return "X86ISD::BLENDV";
- case X86ISD::HADD: return "X86ISD::HADD";
- case X86ISD::HSUB: return "X86ISD::HSUB";
- case X86ISD::FHADD: return "X86ISD::FHADD";
- case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
- case X86ISD::FMAX: return "X86ISD::FMAX";
- case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
- case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
- case X86ISD::FMIN: return "X86ISD::FMIN";
- case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
- case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
- case X86ISD::FMAXC: return "X86ISD::FMAXC";
- case X86ISD::FMINC: return "X86ISD::FMINC";
- case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
- case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
- case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
- case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
- case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
- case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
- case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
- case X86ISD::EH_SJLJ_SETUP_DISPATCH:
- return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
- case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
- case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
- case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
- case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
- case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
- case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
- case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
- case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
- return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
- case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
- return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
- case X86ISD::LADD: return "X86ISD::LADD";
- case X86ISD::LSUB: return "X86ISD::LSUB";
- case X86ISD::LOR: return "X86ISD::LOR";
- case X86ISD::LXOR: return "X86ISD::LXOR";
- case X86ISD::LAND: return "X86ISD::LAND";
- case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
- case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
- case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
- case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
- case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
- case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
- case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
- case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
- case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
- case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
- case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
- case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
- case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
- case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
- case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
- case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
- case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
- case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
- case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
- case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
- case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
- case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
- case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
- case X86ISD::VSHL: return "X86ISD::VSHL";
- case X86ISD::VSRL: return "X86ISD::VSRL";
- case X86ISD::VSRA: return "X86ISD::VSRA";
- case X86ISD::VSHLI: return "X86ISD::VSHLI";
- case X86ISD::VSRLI: return "X86ISD::VSRLI";
- case X86ISD::VSRAI: return "X86ISD::VSRAI";
- case X86ISD::VSHLV: return "X86ISD::VSHLV";
- case X86ISD::VSRLV: return "X86ISD::VSRLV";
- case X86ISD::VSRAV: return "X86ISD::VSRAV";
- case X86ISD::VROTLI: return "X86ISD::VROTLI";
- case X86ISD::VROTRI: return "X86ISD::VROTRI";
- case X86ISD::VPPERM: return "X86ISD::VPPERM";
- case X86ISD::CMPP: return "X86ISD::CMPP";
- case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
- case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
- case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
- case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
- case X86ISD::ADD: return "X86ISD::ADD";
- case X86ISD::SUB: return "X86ISD::SUB";
- case X86ISD::ADC: return "X86ISD::ADC";
- case X86ISD::SBB: return "X86ISD::SBB";
- case X86ISD::SMUL: return "X86ISD::SMUL";
- case X86ISD::UMUL: return "X86ISD::UMUL";
- case X86ISD::OR: return "X86ISD::OR";
- case X86ISD::XOR: return "X86ISD::XOR";
- case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BEXTR: return "X86ISD::BEXTR";
- case X86ISD::BZHI: return "X86ISD::BZHI";
- case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
- case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
- case X86ISD::PTEST: return "X86ISD::PTEST";
- case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::KORTEST: return "X86ISD::KORTEST";
- case X86ISD::KTEST: return "X86ISD::KTEST";
- case X86ISD::KADD: return "X86ISD::KADD";
- case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
- case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
- case X86ISD::PACKSS: return "X86ISD::PACKSS";
- case X86ISD::PACKUS: return "X86ISD::PACKUS";
- case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
- case X86ISD::VALIGN: return "X86ISD::VALIGN";
- case X86ISD::VSHLD: return "X86ISD::VSHLD";
- case X86ISD::VSHRD: return "X86ISD::VSHRD";
- case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
- case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
- case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
- case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
- case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
- case X86ISD::SHUFP: return "X86ISD::SHUFP";
- case X86ISD::SHUF128: return "X86ISD::SHUF128";
- case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
- case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
- case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
- case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
- case X86ISD::MOVSD: return "X86ISD::MOVSD";
- case X86ISD::MOVSS: return "X86ISD::MOVSS";
- case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
- case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
- case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
- case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
- case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
- case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
- case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
- case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
- case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
- case X86ISD::VPERMV: return "X86ISD::VPERMV";
- case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
- case X86ISD::VPERMI: return "X86ISD::VPERMI";
- case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
- case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
- case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
- case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
- case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
- case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
- case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
- case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
- case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
- case X86ISD::PSADBW: return "X86ISD::PSADBW";
- case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
- case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
- case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
- case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
- case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
- case X86ISD::MFENCE: return "X86ISD::MFENCE";
- case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
- case X86ISD::SAHF: return "X86ISD::SAHF";
- case X86ISD::RDRAND: return "X86ISD::RDRAND";
- case X86ISD::RDSEED: return "X86ISD::RDSEED";
- case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
- case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
- case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
- case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
- case X86ISD::VPSHA: return "X86ISD::VPSHA";
- case X86ISD::VPSHL: return "X86ISD::VPSHL";
- case X86ISD::VPCOM: return "X86ISD::VPCOM";
- case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
- case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
- case X86ISD::FMSUB: return "X86ISD::FMSUB";
- case X86ISD::FNMADD: return "X86ISD::FNMADD";
- case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
- case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
- case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
- case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
- case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
- case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
- case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
- case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
- case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
- case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
- case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
- case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
- case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
- case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
- case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
- case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
- case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
- case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
- case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
- case X86ISD::XTEST: return "X86ISD::XTEST";
- case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
- case X86ISD::EXPAND: return "X86ISD::EXPAND";
- case X86ISD::SELECTS: return "X86ISD::SELECTS";
- case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
- case X86ISD::RCP14: return "X86ISD::RCP14";
- case X86ISD::RCP14S: return "X86ISD::RCP14S";
- case X86ISD::RCP28: return "X86ISD::RCP28";
- case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
- case X86ISD::RCP28S: return "X86ISD::RCP28S";
- case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
- case X86ISD::EXP2: return "X86ISD::EXP2";
- case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
- case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
- case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
- case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
- case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
- case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
- case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
- case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
- case X86ISD::FADDS: return "X86ISD::FADDS";
- case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
- case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
- case X86ISD::FSUBS: return "X86ISD::FSUBS";
- case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
- case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
- case X86ISD::FMULS: return "X86ISD::FMULS";
- case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
- case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
- case X86ISD::FDIVS: return "X86ISD::FDIVS";
- case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
- case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
- case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
- case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
- case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
- case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
- case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
- case X86ISD::SCALEF: return "X86ISD::SCALEF";
- case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
- case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
- case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
- case X86ISD::AVG: return "X86ISD::AVG";
- case X86ISD::MULHRS: return "X86ISD::MULHRS";
- case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
- case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
- case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
- case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
- case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
- case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
- case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
- case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
- case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
- case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
- case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
- case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
- case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
- case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
- case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
- case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
- case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
- case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
- case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
- case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
- case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
- case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
- case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
- case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
- case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
- case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
- case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
- case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
- case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
- case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
- case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
- case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
- case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
- case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
- case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
- case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
- case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
- case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
- case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
- case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
- case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
- case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
- case X86ISD::LWPINS: return "X86ISD::LWPINS";
- case X86ISD::MGATHER: return "X86ISD::MGATHER";
- case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
- case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
- case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
- case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
- case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
- case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
- case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
- case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
- case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
- case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
- case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
- case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
- case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
- case X86ISD::ENQCMD: return "X86ISD:ENQCMD";
- case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS";
- case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
+#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
+ NODE_NAME_CASE(BSF)
+ NODE_NAME_CASE(BSR)
+ NODE_NAME_CASE(FSHL)
+ NODE_NAME_CASE(FSHR)
+ NODE_NAME_CASE(FAND)
+ NODE_NAME_CASE(FANDN)
+ NODE_NAME_CASE(FOR)
+ NODE_NAME_CASE(FXOR)
+ NODE_NAME_CASE(FILD)
+ NODE_NAME_CASE(FIST)
+ NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+ NODE_NAME_CASE(FLD)
+ NODE_NAME_CASE(FST)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(BT)
+ NODE_NAME_CASE(CMP)
+ NODE_NAME_CASE(FCMP)
+ NODE_NAME_CASE(STRICT_FCMP)
+ NODE_NAME_CASE(STRICT_FCMPS)
+ NODE_NAME_CASE(COMI)
+ NODE_NAME_CASE(UCOMI)
+ NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(STRICT_CMPM)
+ NODE_NAME_CASE(CMPM_SAE)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETCC_CARRY)
+ NODE_NAME_CASE(FSETCC)
+ NODE_NAME_CASE(FSETCCM)
+ NODE_NAME_CASE(FSETCCM_SAE)
+ NODE_NAME_CASE(CMOV)
+ NODE_NAME_CASE(BRCOND)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(IRET)
+ NODE_NAME_CASE(REP_STOS)
+ NODE_NAME_CASE(REP_MOVS)
+ NODE_NAME_CASE(GlobalBaseReg)
+ NODE_NAME_CASE(Wrapper)
+ NODE_NAME_CASE(WrapperRIP)
+ NODE_NAME_CASE(MOVQ2DQ)
+ NODE_NAME_CASE(MOVDQ2Q)
+ NODE_NAME_CASE(MMX_MOVD2W)
+ NODE_NAME_CASE(MMX_MOVW2D)
+ NODE_NAME_CASE(PEXTRB)
+ NODE_NAME_CASE(PEXTRW)
+ NODE_NAME_CASE(INSERTPS)
+ NODE_NAME_CASE(PINSRB)
+ NODE_NAME_CASE(PINSRW)
+ NODE_NAME_CASE(PSHUFB)
+ NODE_NAME_CASE(ANDNP)
+ NODE_NAME_CASE(BLENDI)
+ NODE_NAME_CASE(BLENDV)
+ NODE_NAME_CASE(HADD)
+ NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(FHADD)
+ NODE_NAME_CASE(FHSUB)
+ NODE_NAME_CASE(CONFLICT)
+ NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(FMAXS)
+ NODE_NAME_CASE(FMAX_SAE)
+ NODE_NAME_CASE(FMAXS_SAE)
+ NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMINS)
+ NODE_NAME_CASE(FMIN_SAE)
+ NODE_NAME_CASE(FMINS_SAE)
+ NODE_NAME_CASE(FMAXC)
+ NODE_NAME_CASE(FMINC)
+ NODE_NAME_CASE(FRSQRT)
+ NODE_NAME_CASE(FRCP)
+ NODE_NAME_CASE(EXTRQI)
+ NODE_NAME_CASE(INSERTQI)
+ NODE_NAME_CASE(TLSADDR)
+ NODE_NAME_CASE(TLSBASEADDR)
+ NODE_NAME_CASE(TLSCALL)
+ NODE_NAME_CASE(EH_SJLJ_SETJMP)
+ NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+ NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+ NODE_NAME_CASE(EH_RETURN)
+ NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(LCMPXCHG_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+ NODE_NAME_CASE(LADD)
+ NODE_NAME_CASE(LSUB)
+ NODE_NAME_CASE(LOR)
+ NODE_NAME_CASE(LXOR)
+ NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(VZEXT_MOVL)
+ NODE_NAME_CASE(VZEXT_LOAD)
+ NODE_NAME_CASE(VEXTRACT_STORE)
+ NODE_NAME_CASE(VTRUNC)
+ NODE_NAME_CASE(VTRUNCS)
+ NODE_NAME_CASE(VTRUNCUS)
+ NODE_NAME_CASE(VMTRUNC)
+ NODE_NAME_CASE(VMTRUNCS)
+ NODE_NAME_CASE(VMTRUNCUS)
+ NODE_NAME_CASE(VTRUNCSTORES)
+ NODE_NAME_CASE(VTRUNCSTOREUS)
+ NODE_NAME_CASE(VMTRUNCSTORES)
+ NODE_NAME_CASE(VMTRUNCSTOREUS)
+ NODE_NAME_CASE(VFPEXT)
+ NODE_NAME_CASE(STRICT_VFPEXT)
+ NODE_NAME_CASE(VFPEXT_SAE)
+ NODE_NAME_CASE(VFPEXTS)
+ NODE_NAME_CASE(VFPEXTS_SAE)
+ NODE_NAME_CASE(VFPROUND)
+ NODE_NAME_CASE(STRICT_VFPROUND)
+ NODE_NAME_CASE(VMFPROUND)
+ NODE_NAME_CASE(VFPROUND_RND)
+ NODE_NAME_CASE(VFPROUNDS)
+ NODE_NAME_CASE(VFPROUNDS_RND)
+ NODE_NAME_CASE(VSHLDQ)
+ NODE_NAME_CASE(VSRLDQ)
+ NODE_NAME_CASE(VSHL)
+ NODE_NAME_CASE(VSRL)
+ NODE_NAME_CASE(VSRA)
+ NODE_NAME_CASE(VSHLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VSRAI)
+ NODE_NAME_CASE(VSHLV)
+ NODE_NAME_CASE(VSRLV)
+ NODE_NAME_CASE(VSRAV)
+ NODE_NAME_CASE(VROTLI)
+ NODE_NAME_CASE(VROTRI)
+ NODE_NAME_CASE(VPPERM)
+ NODE_NAME_CASE(CMPP)
+ NODE_NAME_CASE(STRICT_CMPP)
+ NODE_NAME_CASE(PCMPEQ)
+ NODE_NAME_CASE(PCMPGT)
+ NODE_NAME_CASE(PHMINPOS)
+ NODE_NAME_CASE(ADD)
+ NODE_NAME_CASE(SUB)
+ NODE_NAME_CASE(ADC)
+ NODE_NAME_CASE(SBB)
+ NODE_NAME_CASE(SMUL)
+ NODE_NAME_CASE(UMUL)
+ NODE_NAME_CASE(OR)
+ NODE_NAME_CASE(XOR)
+ NODE_NAME_CASE(AND)
+ NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BZHI)
+ NODE_NAME_CASE(PDEP)
+ NODE_NAME_CASE(PEXT)
+ NODE_NAME_CASE(MUL_IMM)
+ NODE_NAME_CASE(MOVMSK)
+ NODE_NAME_CASE(PTEST)
+ NODE_NAME_CASE(TESTP)
+ NODE_NAME_CASE(KORTEST)
+ NODE_NAME_CASE(KTEST)
+ NODE_NAME_CASE(KADD)
+ NODE_NAME_CASE(KSHIFTL)
+ NODE_NAME_CASE(KSHIFTR)
+ NODE_NAME_CASE(PACKSS)
+ NODE_NAME_CASE(PACKUS)
+ NODE_NAME_CASE(PALIGNR)
+ NODE_NAME_CASE(VALIGN)
+ NODE_NAME_CASE(VSHLD)
+ NODE_NAME_CASE(VSHRD)
+ NODE_NAME_CASE(VSHLDV)
+ NODE_NAME_CASE(VSHRDV)
+ NODE_NAME_CASE(PSHUFD)
+ NODE_NAME_CASE(PSHUFHW)
+ NODE_NAME_CASE(PSHUFLW)
+ NODE_NAME_CASE(SHUFP)
+ NODE_NAME_CASE(SHUF128)
+ NODE_NAME_CASE(MOVLHPS)
+ NODE_NAME_CASE(MOVHLPS)
+ NODE_NAME_CASE(MOVDDUP)
+ NODE_NAME_CASE(MOVSHDUP)
+ NODE_NAME_CASE(MOVSLDUP)
+ NODE_NAME_CASE(MOVSD)
+ NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(UNPCKL)
+ NODE_NAME_CASE(UNPCKH)
+ NODE_NAME_CASE(VBROADCAST)
+ NODE_NAME_CASE(VBROADCAST_LOAD)
+ NODE_NAME_CASE(VBROADCASTM)
+ NODE_NAME_CASE(SUBV_BROADCAST)
+ NODE_NAME_CASE(VPERMILPV)
+ NODE_NAME_CASE(VPERMILPI)
+ NODE_NAME_CASE(VPERM2X128)
+ NODE_NAME_CASE(VPERMV)
+ NODE_NAME_CASE(VPERMV3)
+ NODE_NAME_CASE(VPERMI)
+ NODE_NAME_CASE(VPTERNLOG)
+ NODE_NAME_CASE(VFIXUPIMM)
+ NODE_NAME_CASE(VFIXUPIMM_SAE)
+ NODE_NAME_CASE(VFIXUPIMMS)
+ NODE_NAME_CASE(VFIXUPIMMS_SAE)
+ NODE_NAME_CASE(VRANGE)
+ NODE_NAME_CASE(VRANGE_SAE)
+ NODE_NAME_CASE(VRANGES)
+ NODE_NAME_CASE(VRANGES_SAE)
+ NODE_NAME_CASE(PMULUDQ)
+ NODE_NAME_CASE(PMULDQ)
+ NODE_NAME_CASE(PSADBW)
+ NODE_NAME_CASE(DBPSADBW)
+ NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+ NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(WIN_ALLOCA)
+ NODE_NAME_CASE(MEMBARRIER)
+ NODE_NAME_CASE(MFENCE)
+ NODE_NAME_CASE(SEG_ALLOCA)
+ NODE_NAME_CASE(PROBED_ALLOCA)
+ NODE_NAME_CASE(RDRAND)
+ NODE_NAME_CASE(RDSEED)
+ NODE_NAME_CASE(RDPKRU)
+ NODE_NAME_CASE(WRPKRU)
+ NODE_NAME_CASE(VPMADDUBSW)
+ NODE_NAME_CASE(VPMADDWD)
+ NODE_NAME_CASE(VPSHA)
+ NODE_NAME_CASE(VPSHL)
+ NODE_NAME_CASE(VPCOM)
+ NODE_NAME_CASE(VPCOMU)
+ NODE_NAME_CASE(VPERMIL2)
+ NODE_NAME_CASE(FMSUB)
+ NODE_NAME_CASE(STRICT_FMSUB)
+ NODE_NAME_CASE(FNMADD)
+ NODE_NAME_CASE(STRICT_FNMADD)
+ NODE_NAME_CASE(FNMSUB)
+ NODE_NAME_CASE(STRICT_FNMSUB)
+ NODE_NAME_CASE(FMADDSUB)
+ NODE_NAME_CASE(FMSUBADD)
+ NODE_NAME_CASE(FMADD_RND)
+ NODE_NAME_CASE(FNMADD_RND)
+ NODE_NAME_CASE(FMSUB_RND)
+ NODE_NAME_CASE(FNMSUB_RND)
+ NODE_NAME_CASE(FMADDSUB_RND)
+ NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VPMADD52H)
+ NODE_NAME_CASE(VPMADD52L)
+ NODE_NAME_CASE(VRNDSCALE)
+ NODE_NAME_CASE(STRICT_VRNDSCALE)
+ NODE_NAME_CASE(VRNDSCALE_SAE)
+ NODE_NAME_CASE(VRNDSCALES)
+ NODE_NAME_CASE(VRNDSCALES_SAE)
+ NODE_NAME_CASE(VREDUCE)
+ NODE_NAME_CASE(VREDUCE_SAE)
+ NODE_NAME_CASE(VREDUCES)
+ NODE_NAME_CASE(VREDUCES_SAE)
+ NODE_NAME_CASE(VGETMANT)
+ NODE_NAME_CASE(VGETMANT_SAE)
+ NODE_NAME_CASE(VGETMANTS)
+ NODE_NAME_CASE(VGETMANTS_SAE)
+ NODE_NAME_CASE(PCMPESTR)
+ NODE_NAME_CASE(PCMPISTR)
+ NODE_NAME_CASE(XTEST)
+ NODE_NAME_CASE(COMPRESS)
+ NODE_NAME_CASE(EXPAND)
+ NODE_NAME_CASE(SELECTS)
+ NODE_NAME_CASE(ADDSUB)
+ NODE_NAME_CASE(RCP14)
+ NODE_NAME_CASE(RCP14S)
+ NODE_NAME_CASE(RCP28)
+ NODE_NAME_CASE(RCP28_SAE)
+ NODE_NAME_CASE(RCP28S)
+ NODE_NAME_CASE(RCP28S_SAE)
+ NODE_NAME_CASE(EXP2)
+ NODE_NAME_CASE(EXP2_SAE)
+ NODE_NAME_CASE(RSQRT14)
+ NODE_NAME_CASE(RSQRT14S)
+ NODE_NAME_CASE(RSQRT28)
+ NODE_NAME_CASE(RSQRT28_SAE)
+ NODE_NAME_CASE(RSQRT28S)
+ NODE_NAME_CASE(RSQRT28S_SAE)
+ NODE_NAME_CASE(FADD_RND)
+ NODE_NAME_CASE(FADDS)
+ NODE_NAME_CASE(FADDS_RND)
+ NODE_NAME_CASE(FSUB_RND)
+ NODE_NAME_CASE(FSUBS)
+ NODE_NAME_CASE(FSUBS_RND)
+ NODE_NAME_CASE(FMUL_RND)
+ NODE_NAME_CASE(FMULS)
+ NODE_NAME_CASE(FMULS_RND)
+ NODE_NAME_CASE(FDIV_RND)
+ NODE_NAME_CASE(FDIVS)
+ NODE_NAME_CASE(FDIVS_RND)
+ NODE_NAME_CASE(FSQRT_RND)
+ NODE_NAME_CASE(FSQRTS)
+ NODE_NAME_CASE(FSQRTS_RND)
+ NODE_NAME_CASE(FGETEXP)
+ NODE_NAME_CASE(FGETEXP_SAE)
+ NODE_NAME_CASE(FGETEXPS)
+ NODE_NAME_CASE(FGETEXPS_SAE)
+ NODE_NAME_CASE(SCALEF)
+ NODE_NAME_CASE(SCALEF_RND)
+ NODE_NAME_CASE(SCALEFS)
+ NODE_NAME_CASE(SCALEFS_RND)
+ NODE_NAME_CASE(AVG)
+ NODE_NAME_CASE(MULHRS)
+ NODE_NAME_CASE(SINT_TO_FP_RND)
+ NODE_NAME_CASE(UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTTP2SI)
+ NODE_NAME_CASE(CVTTP2UI)
+ NODE_NAME_CASE(STRICT_CVTTP2SI)
+ NODE_NAME_CASE(STRICT_CVTTP2UI)
+ NODE_NAME_CASE(MCVTTP2SI)
+ NODE_NAME_CASE(MCVTTP2UI)
+ NODE_NAME_CASE(CVTTP2SI_SAE)
+ NODE_NAME_CASE(CVTTP2UI_SAE)
+ NODE_NAME_CASE(CVTTS2SI)
+ NODE_NAME_CASE(CVTTS2UI)
+ NODE_NAME_CASE(CVTTS2SI_SAE)
+ NODE_NAME_CASE(CVTTS2UI_SAE)
+ NODE_NAME_CASE(CVTSI2P)
+ NODE_NAME_CASE(CVTUI2P)
+ NODE_NAME_CASE(STRICT_CVTSI2P)
+ NODE_NAME_CASE(STRICT_CVTUI2P)
+ NODE_NAME_CASE(MCVTSI2P)
+ NODE_NAME_CASE(MCVTUI2P)
+ NODE_NAME_CASE(VFPCLASS)
+ NODE_NAME_CASE(VFPCLASSS)
+ NODE_NAME_CASE(MULTISHIFT)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTPS2PH)
+ NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(CVTPH2PS)
+ NODE_NAME_CASE(STRICT_CVTPH2PS)
+ NODE_NAME_CASE(CVTPH2PS_SAE)
+ NODE_NAME_CASE(CVTP2SI)
+ NODE_NAME_CASE(CVTP2UI)
+ NODE_NAME_CASE(MCVTP2SI)
+ NODE_NAME_CASE(MCVTP2UI)
+ NODE_NAME_CASE(CVTP2SI_RND)
+ NODE_NAME_CASE(CVTP2UI_RND)
+ NODE_NAME_CASE(CVTS2SI)
+ NODE_NAME_CASE(CVTS2UI)
+ NODE_NAME_CASE(CVTS2SI_RND)
+ NODE_NAME_CASE(CVTS2UI_RND)
+ NODE_NAME_CASE(CVTNE2PS2BF16)
+ NODE_NAME_CASE(CVTNEPS2BF16)
+ NODE_NAME_CASE(MCVTNEPS2BF16)
+ NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(LWPINS)
+ NODE_NAME_CASE(MGATHER)
+ NODE_NAME_CASE(MSCATTER)
+ NODE_NAME_CASE(VPDPBUSD)
+ NODE_NAME_CASE(VPDPBUSDS)
+ NODE_NAME_CASE(VPDPWSSD)
+ NODE_NAME_CASE(VPDPWSSDS)
+ NODE_NAME_CASE(VPSHUFBITQMB)
+ NODE_NAME_CASE(GF2P8MULB)
+ NODE_NAME_CASE(GF2P8AFFINEQB)
+ NODE_NAME_CASE(GF2P8AFFINEINVQB)
+ NODE_NAME_CASE(NT_CALL)
+ NODE_NAME_CASE(NT_BRIND)
+ NODE_NAME_CASE(UMWAIT)
+ NODE_NAME_CASE(TPAUSE)
+ NODE_NAME_CASE(ENQCMD)
+ NODE_NAME_CASE(ENQCMDS)
+ NODE_NAME_CASE(VP2INTERSECT)
}
return nullptr;
+#undef NODE_NAME_CASE
}
/// Return true if the addressing mode represented by AM is legal for this
@@ -30021,7 +30716,8 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
- if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
+ // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
+ if (Subtarget.hasXOP() &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
@@ -30107,7 +30803,7 @@ bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
- if (!VT1.isInteger() || !VT2.isInteger())
+ if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
@@ -30148,6 +30844,39 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ // A uniform shift amount in a vector shift or funnel shift may be much
+ // cheaper than a generic variable vector shift, so make that pattern visible
+ // to SDAG by sinking the shuffle instruction next to the shift.
+ int ShiftAmountOpNum = -1;
+ if (I->isShift())
+ ShiftAmountOpNum = 1;
+ else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmountOpNum = 2;
+ }
+
+ if (ShiftAmountOpNum == -1)
+ return false;
+
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+ if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
+ isVectorShiftByScalarCheap(I->getType())) {
+ Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+ return true;
+ }
+
+ return false;
+}
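// --- Illustrative aside (editor's sketch, not part of the patch or the LLVM
// source): the kind of IR shouldSinkOperands targets. Given a splatted shift
// amount defined away from its use, e.g.
//   %amt.splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
//   ...
//   %r = shl <4 x i32> %x, %amt.splat
// CodeGenPrepare duplicates the shufflevector next to the shl, so instruction
// selection sees the uniform amount and can use a cheap shift-by-scalar form
// instead of a general variable vector shift.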
+
+bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
+ if (!Subtarget.is64Bit())
+ return false;
+ return TargetLowering::shouldConvertPhiType(From, To);
+}
+
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;
@@ -30191,7 +30920,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
-bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
if (!VT.isSimple())
return false;
@@ -30336,7 +31065,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
- unsigned Align = MI.getOperand(8).getImm();
+ Align Alignment = Align(MI.getOperand(8).getImm());
MachineFunction *MF = MBB->getParent();
@@ -30376,7 +31105,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
- bool NeedsAlign = (Align > 8);
+ bool NeedsAlign = (Alignment > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
@@ -30524,17 +31253,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
- assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
- .addReg(OverflowAddrReg)
- .addImm(Align-1);
+ .addReg(OverflowAddrReg)
+ .addImm(Alignment.value() - 1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
- .addReg(TmpReg)
- .addImm(~(uint64_t)(Align-1));
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
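// --- Illustrative aside (editor's sketch, not part of the patch): the
// ADD64ri32/AND64ri32 pair emitted in the NeedsAlign branch above is the usual
// round-up-to-alignment idiom for a power-of-two alignment, modelled in plain
// C++ below.
#include <cstdint>

static uint64_t alignUp(uint64_t Addr, uint64_t Alignment) {
  // aligned_addr = (addr + (align - 1)) & ~(align - 1), Alignment a power of 2.
  return (Addr + (Alignment - 1)) & ~(Alignment - 1);
}
// e.g. alignUp(0x1003, 16) == 0x1010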
@@ -30630,7 +31358,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
- /*Size=*/16, /*Align=*/16);
+ /*Size=*/16, Align(16));
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
@@ -30697,11 +31425,13 @@ static bool isCMOVPseudo(MachineInstr &MI) {
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -30998,8 +31728,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
- ++NextMIIt;
- NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
+ NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
}
}
@@ -31071,6 +31800,112 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
return SinkMBB;
}
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, testMBB);
+ MF->insert(MBBIter, blockMBB);
+ MF->insert(MBBIter, tailMBB);
+
+ Register sizeVReg = MI.getOperand(1).getReg();
+
+ Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+
+ Register TmpStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+ Register FinalStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+
+ BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+ .addReg(physSPReg);
+ {
+ const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+ .addReg(TmpStackPtr)
+ .addReg(sizeVReg);
+ }
+
+ // Test whether the current stack pointer has reached the final stack pointer.
+
+ BuildMI(testMBB, DL,
+ TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackPtr)
+ .addReg(physSPReg);
+
+ BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(tailMBB)
+ .addImm(X86::COND_L);
+ testMBB->addSuccessor(blockMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // Touch the block then extend it. This is done in the opposite order from a
+ // static probe, where we allocate and then touch, so that we do not need to
+ // probe the tail of the static alloca. Possible scenarios are:
+ //
+ //       + ---- <- ------------ <- ------------- <- ------------ +
+ //       |                                                       |
+ // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
+ //                                                               |                                                               |
+ //                                                               + <- ----------- <- ------------ <- ----------- <- ------------ +
+ //
+ // The property we want to enforce is to never have more than one [page alloc]
+ // between two probes.
+
+ const unsigned MovMIOpc =
+ TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
+ addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
+ .addImm(0);
+
+ BuildMI(blockMBB, DL,
+ TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
+ .addReg(physSPReg)
+ .addImm(ProbeSize);
+
+ BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+ blockMBB->addSuccessor(testMBB);
+
+ // Replace the original instruction with the expected stack pointer.
+ BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(FinalStackPtr);
+
+ tailMBB->splice(tailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(testMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return tailMBB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -31231,29 +32066,16 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
+ // Marking this as an EH pad but not a funclet entry block causes PEI to
+ // restore stack pointers in the block.
+ RestoreMBB->setIsEHPad(true);
+
auto RestoreMBBI = RestoreMBB->begin();
- BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- MachineFunction *MF = BB->getParent();
- const Constant *PerFn = MF->getFunction().getPersonalityFn();
- bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
- // Only 32-bit SEH requires special handling for catchpad.
- if (IsSEH && Subtarget.is32Bit()) {
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
- }
- MI.eraseFromParent();
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
@@ -31755,12 +32577,17 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
- Register ZReg = MRI.createVirtualRegister(PtrRC);
- unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
- BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
- .addDef(ZReg)
- .addReg(ZReg, RegState::Undef)
- .addReg(ZReg, RegState::Undef);
+ Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
+
+ if (PVT == MVT::i64) {
+ Register TmpZReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
+ .addImm(0)
+ .addReg(ZReg)
+ .addImm(X86::sub_32bit);
+ ZReg = TmpZReg;
+ }
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
@@ -31889,7 +32716,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
@@ -32236,6 +33063,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
+ auto TMMImmToTMMReg = [](unsigned Imm) {
+ assert(Imm < 8 && "Illegal tmm index");
+ return X86::TMM0 + Imm;
+ };
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
@@ -32250,11 +33081,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case X86::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
+ case X86::PROBED_ALLOCA_32:
+ case X86::PROBED_ALLOCA_64:
+ return EmitLoweredProbedAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
@@ -32268,11 +33100,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -32327,7 +33161,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
@@ -32348,7 +33183,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
- int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int NewCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
@@ -32483,6 +33319,97 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB->addLiveIn(BasePtr);
return BB;
}
+ case TargetOpcode::PREALLOCATED_SETUP: {
+ assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setHasPreallocatedCall(true);
+ int64_t PreallocatedId = MI.getOperand(0).getImm();
+ size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
+ assert(StackAdjustment != 0 && "0 stack adjustment");
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
+ << StackAdjustment << "\n");
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
+ .addReg(X86::ESP)
+ .addImm(StackAdjustment);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case TargetOpcode::PREALLOCATED_ARG: {
+ assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
+ int64_t PreallocatedId = MI.getOperand(1).getImm();
+ int64_t ArgIdx = MI.getOperand(2).getImm();
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
+ << ", arg offset " << ArgOffset << "\n");
+ // stack pointer + offset
+ addRegOffset(
+ BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
+ X86::ESP, false, ArgOffset);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case X86::PTDPBSSD:
+ case X86::PTDPBSUD:
+ case X86::PTDPBUSD:
+ case X86::PTDPBUUD:
+ case X86::PTDPBF16PS: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILEZERO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(0).getImm();
+ BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILELOADD:
+ case X86::PTILELOADDT1:
+ case X86::PTILESTORED: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTILELOADD: Opc = X86::TILELOADD; break;
+ case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+ case X86::PTILESTORED: Opc = X86::TILESTORED; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
}
}
@@ -32492,20 +33419,53 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &Demanded,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
- // Only optimize Ands to prevent shrinking a constant that could be
- // matched by movzx.
- if (Op.getOpcode() != ISD::AND)
- return false;
-
EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSize = VT.getScalarSizeInBits();
- // Ignore vectors.
- if (VT.isVector())
+ if (VT.isVector()) {
+ // If the constant is all sign bits within the active bits, then we should
+ // extend it to the entire constant to allow it to act as a boolean constant
+ // vector.
+ auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i] || V.getOperand(i).isUndef())
+ continue;
+ const APInt &Val = V.getConstantOperandAPInt(i);
+ if (Val.getBitWidth() > Val.getNumSignBits() &&
+ Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
+ return true;
+ }
+ return false;
+ };
+ // For vectors - if we have a constant, then try to sign extend.
+ // TODO: Handle AND/ANDN cases.
+ unsigned ActiveBits = DemandedBits.getActiveBits();
+ if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
+ (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+ NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
+ EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
+ EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
+ VT.getVectorNumElements());
+ SDValue NewC =
+ TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
+ Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
+ SDValue NewOp =
+ TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ }
return false;
+ }
- unsigned Size = VT.getSizeInBits();
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
+ if (Opcode != ISD::AND)
+ return false;
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -32515,7 +33475,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
- APInt ShrunkMask = Mask & Demanded;
+ APInt ShrunkMask = Mask & DemandedBits;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
@@ -32527,10 +33487,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
- Width = std::min(Width, Size);
+ Width = std::min(Width, EltSize);
// Calculate a possible zero extend mask for this constant.
- APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+ APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
@@ -32539,7 +33499,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
- if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
return false;
// Replace the constant with the zero extend mask.
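// --- Illustrative aside (editor's simplified model, not part of the patch):
// the scalar AND path above in plain integers. The shrunk mask is widened to a
// power-of-two byte width so the AND can still be matched as a movzx-style
// zero extension, but only if the widened bits are covered by the original
// mask or by non-demanded bits.
#include <algorithm>
#include <cstdint>

static uint64_t shrinkAndMask(uint64_t Mask, uint64_t Demanded,
                              unsigned SizeInBits = 32) {
  uint64_t Shrunk = Mask & Demanded;
  if (Shrunk == 0)
    return Mask; // the all-zero case is handled separately
  unsigned Width = 0;
  for (uint64_t M = Shrunk; M; M >>= 1)
    ++Width;                                 // active bits of the shrunk mask
  unsigned Pow2 = 8;
  while (Pow2 < Width)
    Pow2 *= 2;                               // round up to a power-of-2 width
  Width = std::min(Pow2, SizeInBits);
  uint64_t ZExtMask = Width >= 64 ? ~0ULL : (1ULL << Width) - 1;
  if ((ZExtMask & ~(Mask | ~Demanded)) == 0) // representable with don't-cares
    return ZExtMask;
  return Mask;                               // keep the original constant
}
// e.g. shrinkAndMask(0x1FF, 0xFF) == 0xFF - the AND becomes a zero extension.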
@@ -32555,6 +33515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -32582,7 +33543,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth, false);
+ Known = Known.anyextOrTrunc(BitWidth);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -32652,10 +33613,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
}
case X86ISD::PSADBW: {
@@ -32679,6 +33637,76 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero &= Known2.Zero;
break;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ break;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ Known = DAG.computeKnownBits(Op0, Depth + 1);
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ }
+ }
+ break;
+ }
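// --- Illustrative aside (editor's sketch, not part of the patch): the scalar
// BEXTR semantics the known-bits handling above relies on. The control value
// packs the start bit in bits [7:0] and the field length in bits [15:8]; the
// result is the extracted field zero-extended, so everything at or above
// 'Length' is known zero.
#include <cstdint>

static uint64_t bextr64(uint64_t Src, uint64_t Control) {
  unsigned Start = Control & 0xFF;
  unsigned Length = (Control >> 8) & 0xFF;
  if (Length == 0 || Start >= 64)
    return 0;
  uint64_t Field = Src >> Start;
  return Length >= 64 ? Field : Field & ((1ULL << Length) - 1);
}
// e.g. bextr64(0xABCD, 0x0804) == 0xBC - an 8-bit field starting at bit 4.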
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P:
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI:
+ case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI:
+ case X86ISD::MCVTTP2UI:
+ case X86ISD::MCVTSI2P:
+ case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND:
+ case X86ISD::VMFPROUND:
+ case X86ISD::CVTPS2PH:
+ case X86ISD::MCVTPS2PH: {
+ // Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ case X86ISD::STRICT_CVTPS2PH: {
+ // Strict Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(1).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::MOVQ2DQ: {
+ // Move from MMX to XMM. Upper half of XMM should be 0.
+ if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
+ Known.setAllZero();
+ break;
+ }
}
// Handle target shuffles.
@@ -32745,11 +33773,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return VTBits;
case X86ISD::VTRUNC: {
- // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
- unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
- unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
@@ -32877,6 +33906,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
return N;
}
+// Helper to look for a normal load that can be narrowed into a vzload with the
+// specified VT and memory VT. Returns SDValue() on failure.
+static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
+ SelectionDAG &DAG) {
+ // Can't if the load is volatile or atomic.
+ if (!LN->isSimple())
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+}
+
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
@@ -33021,9 +34065,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
-
- bool ContainsZeros =
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool ContainsZeros = isAnyZero(Mask);
// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
@@ -33071,7 +34113,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
- scaleShuffleMask<int>(2, RepeatedMask, WordMask);
+ narrowShuffleMaskElts(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
@@ -33114,17 +34156,32 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
+ if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+ 32 <= ShuffleVT.getScalarSizeInBits())) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
+ // Attempt to match against bit rotates.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+ Subtarget.hasAVX512())) {
+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+ Subtarget, Mask);
+ if (0 < RotateAmt) {
+ Shuffle = X86ISD::VROTLI;
+ PermuteImm = (unsigned)RotateAmt;
+ return true;
+ }
+ }
+
return false;
}
@@ -33205,9 +34262,29 @@ static bool matchBinaryPermuteShuffle(
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ // Attempt to match against VALIGND/VALIGNQ rotate.
+ if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
+ ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ if (!isAnyZero(Mask)) {
+ int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
+ if (0 < Rotation) {
+ Shuffle = X86ISD::VALIGN;
+ if (EltSizeInBits == 64)
+ ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
+ else
+ ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+ PermuteImm = Rotation;
+ return true;
+ }
+ }
+ }
+
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -33257,8 +34334,7 @@ static bool matchBinaryPermuteShuffle(
// Attempt to combine to INSERTPS, but only if it has elements that need to
// be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
- MaskVT.is128BitVector() &&
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+ MaskVT.is128BitVector() && isAnyZero(Mask) &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -33386,6 +34462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, V1);
}
+ bool OptForSize = DAG.shouldOptForSize();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -33396,11 +34473,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
- // TODO - this currently prevents all lane shuffles from occurring.
- // TODO - check for writemasks usage instead of always preventing combining.
- // TODO - attempt to narrow Mask back to writemask size.
- bool IsEVEXShuffle =
- RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ bool IsMaskedShuffle = false;
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
+ if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
+ Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
+ IsMaskedShuffle = true;
+ }
+ }
+
+ // If we are shuffling a broadcast (and not introducing zeros) then we can
+ // just use the broadcast directly. This also works for smaller broadcast
+ // elements, as they already repeat across each mask element.
+ if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
+ return DAG.getBitcast(RootVT, V1);
+ }
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
@@ -33420,27 +34507,138 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+ // Handle 128/256-bit lane shuffles of 512-bit vectors.
+ if (RootVT.is512BitVector() &&
+ (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+
+ // If the upper subvectors are zeroable, then an extract+insert is better
+ // than using X86ISD::SHUF128. The insertion is free, even if it has to zero
+ // the upper subvectors.
+ if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+ "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
+ bool UseZero = isAnyZero(BaseMask);
+ Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
+ Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Narrow shuffle mask to v4x128.
+ SmallVector<int, 4> Mask;
+ assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
+
+ // Try to lower to vshuf64x2/vshuf32x4.
+ auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ unsigned PermMask = 0;
+ // Ensure elements came from the same Op.
+ SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
+ for (int i = 0; i < 4; ++i) {
+ assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Mask[i] < 0)
+ continue;
+
+ SDValue Op = Mask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+
+ // Convert the 128-bit shuffle mask selection values into 128-bit
+ // selection bits defined by a vshuf64x2 instruction's immediate control
+ // byte.
+ PermMask |= (Mask[i] % 4) << (i * 2);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, Ops[0]),
+ DAG.getBitcast(ShuffleVT, Ops[1]),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ };
+
+ // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
+ // doesn't work because our mask is for 128 bits and we don't have an MVT
+ // to match that.
+ bool PreferPERMQ =
+ UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
+ isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
+ isUndefOrInRange(Mask[3], 2, 4) &&
+ (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
+ (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
+
+ if (!isAnyZero(Mask) && !PreferPERMQ) {
+ if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
+ return DAG.getBitcast(RootVT, V);
+ }
+ }
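// --- Illustrative aside (editor's sketch, not part of the patch): the control
// byte built by MatchSHUF128 above packs one 2-bit source-lane selector per
// destination 128-bit lane (Mask[i] % 4 picks the lane within the chosen op).
#include <array>
#include <cstdint>

static uint8_t shuf128Imm(const std::array<int, 4> &LaneSel) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= static_cast<uint8_t>((LaneSel[i] & 3) << (i * 2)); // 2 bits per lane
  return Imm;
}
// e.g. shuf128Imm({0, 1, 2, 3}) == 0xE4, the identity lane permutation.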
// Handle 128-bit lane shuffles of 256-bit vectors.
- // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
- // we need to use the zeroing feature.
- // TODO - this should support binary shuffles.
- if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
- !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
- !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+
+ // If the upper half is zeroable, then an extract+insert is better than
+ // using X86ISD::VPERM2X128. The insertion is free, even if it has to zero
+ // the upper half.
+ if (isUndefOrZero(BaseMask[1])) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
+ Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
- MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
- PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
- DAG.getUNDEF(ShuffleVT),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
+ // Prefer blends for sequential shuffles unless we are optimizing for size.
+ if (UnaryShuffle &&
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+
+ // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+ if (!UnaryShuffle && !IsMaskedShuffle) {
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ "Unexpected shuffle sentinel value");
+ // Prefer blends to X86ISD::VPERM2X128.
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+
+ Res = DAG.getNode(
+ X86ISD::VPERM2X128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
}
// For masks that have been widened to 128-bit elements or more,
@@ -33449,9 +34647,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
+ narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
} else {
- Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ Mask.assign(BaseMask.begin(), BaseMask.end());
+ }
+
+ // For masked shuffles, we're trying to match the root width for better
+ // writemask folding, so attempt to scale the mask.
+ // TODO - variable shuffles might need this to be widened again.
+ if (IsMaskedShuffle && NumRootElts > Mask.size()) {
+ assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
+ int MaskScale = NumRootElts / Mask.size();
+ SmallVector<int, 64> ScaledMask;
+ narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
}
unsigned NumMaskElts = Mask.size();
@@ -33484,26 +34693,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
- // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
- // directly if we don't shuffle the lower element and we shuffle the upper
- // (zero) elements within themselves.
- if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
- MaskEltSizeInBits) == 0) {
- unsigned Scale =
- cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
- MaskEltSizeInBits;
- ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
- if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
- isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
- return DAG.getBitcast(RootVT, V1);
- }
- }
-
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
- if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
- && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ if ((Subtarget.hasAVX2() ||
+ (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
+ (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
@@ -33529,7 +34723,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33540,7 +34735,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
@@ -33550,12 +34746,31 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to combine to INSERTPS, but only if the inserted element has come
+ // from a scalar.
+ // TODO: Handle other insertions here as well?
+ if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+ MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, SrcV1),
+ DAG.getBitcast(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33566,10 +34781,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteShuffle(
- MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
- NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, NewV1, NewV2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
@@ -33609,6 +34824,44 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Match shuffle against TRUNCATE patterns.
+ if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+ // Match against a VTRUNC instruction, accounting for src/dst sizes.
+ if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+ Subtarget)) {
+ bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+ ShuffleSrcVT.getVectorNumElements();
+ unsigned Opc =
+ IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
+ if (Depth == 0 && Root.getOpcode() == Opc)
+ return SDValue(); // Nothing to do!
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+ if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+ Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Do we need a more general binary truncation pattern?
+ if (RootSizeInBits < 512 &&
+ ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+ (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+ (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+ isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ return SDValue(); // Nothing to do!
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+ Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
@@ -33618,8 +34871,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
- bool MaskContainsZeros =
- any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
@@ -33714,7 +34966,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
- FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
@@ -33791,7 +35043,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
@@ -33822,7 +35074,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+ VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
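// --- Illustrative aside (editor's sketch, not part of the patch): why the
// zero sentinel above is spelled 0x80. PSHUFB zeroes a destination byte
// whenever bit 7 of its control byte is set (so 255 and 0x80 behave the same),
// and 0x80 is also a VPPERM control value selecting its constant-zero
// operation, so one spelling documents the intent for both. A scalar model of
// a single 16-byte PSHUFB lane:
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> pshufb(const std::array<uint8_t, 16> &Src,
                                      const std::array<uint8_t, 16> &Ctrl) {
  std::array<uint8_t, 16> Dst{};
  for (int i = 0; i != 16; ++i)
    Dst[i] = (Ctrl[i] & 0x80) ? 0 : Src[Ctrl[i] & 0x0F]; // bit 7 set -> zero
  return Dst;
}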
@@ -33897,8 +35149,7 @@ static SDValue combineX86ShuffleChainWithExtract(
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
- while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Src.getOperand(1))) {
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
@@ -34121,7 +35372,8 @@ static SDValue combineX86ShufflesRecursively(
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
- assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+ assert(VT.getSizeInBits() == RootSizeInBits &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
@@ -34135,6 +35387,18 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
+ // Shuffle inputs must be the same size as the result, bail on any larger
+ // inputs and widen any smaller inputs.
+ if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() > RootSizeInBits;
+ }))
+ return SDValue();
+
+ for (SDValue &Op : OpInputs)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
+ SDLoc(Op), RootSizeInBits);
+
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
@@ -34535,6 +35799,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ // TODO: Add vXf64 support.
+ if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
+ return SDValue();
+
+ // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+ if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ SDValue N0 = V.getOperand(0);
+ SDValue N1 = V.getOperand(1);
+ unsigned Imm = V.getConstantOperandVal(2);
+ if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+ MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ return SDValue();
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+ DAG.getTargetConstant(Imm, DL, MVT::i8));
+ };
+
+ switch (N.getOpcode()) {
+ case X86ISD::VPERMILPI:
+ if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+ unsigned Imm = N.getConstantOperandVal(1);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ break;
+ case X86ISD::SHUFP: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (N0 == N1) {
+ if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+ DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
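// --- Illustrative aside (editor's sketch, not part of the patch): the SHUFPS
// immediate holds two 2-bit selectors per source - imm[3:0] builds the low two
// result elements from the first operand, imm[7:4] the high two from the
// second - so commuting the operands, as commuteSHUFP does above, is a nibble
// swap of the immediate. The swapped node then produces its halves in the
// opposite order, which is what the XOR 0xAA / 0x0A / 0xA0 adjustments above
// compensate for.
#include <cstdint>

static uint8_t commuteShufpsImm(uint8_t Imm) {
  return static_cast<uint8_t>(((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4));
}
// e.g. commuteShufpsImm(0xE4) == 0x4E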
+
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34544,35 +35861,105 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
+ bool IsUnary;
+ SmallVector<int, 64> TargetMask;
+ SmallVector<SDValue, 2> TargetOps;
+ if (isTargetShuffle(Opcode))
+ getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
+
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction.
- if (VT.getScalarSizeInBits() == 64 &&
- (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
- Opcode == X86ISD::UNPCKL)) {
- auto BC0 = peekThroughBitcasts(N.getOperand(0));
- auto BC1 = peekThroughBitcasts(N.getOperand(1));
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo, Hi;
- if (Opcode == X86ISD::MOVSD) {
- Lo = BC1.getOperand(0);
- Hi = BC0.getOperand(1);
- } else {
- Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
- Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ SmallVector<int, 16> TargetMask128;
+ if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+ isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
+ SmallVector<int, 16> WidenedMask128 = TargetMask128;
+ while (WidenedMask128.size() > 2) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
+ break;
+ WidenedMask128 = std::move(WidenedMask);
+ }
+ if (WidenedMask128.size() == 2) {
+ assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+ SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+ SDValue BC1 = peekThroughBitcasts(TargetOps.back());
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ bool SingleOp = (TargetOps.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
+ }
}
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
}
}
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
+
+ // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
switch (Opcode) {
+ case X86ISD::MOVDDUP: {
+ SDValue Src = N.getOperand(0);
+ // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
+ if (VT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
+ SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
+ DCI.CombineTo(N.getNode(), Movddup);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
@@ -34598,7 +35985,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
- SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
@@ -34645,6 +36033,190 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return N; // Return N so it doesn't get rechecked!
}
+ // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
+ // i16. So shrink it ourselves if we can make a broadcast_load.
+ if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
+ assert(Subtarget.hasAVX2() && "Expected AVX2");
+ SDValue TruncIn = Src.getOperand(0);
+
+ // If this is a truncate of a non extending load we can just narrow it to
+ // use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+ // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
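
To illustrate the shifted-load case just above: on a little-endian target such as x86, bits [ShiftAmt, ShiftAmt+16) of a wide load are the same bytes as a 16-bit load at byte offset ShiftAmt/8, which is what lets the combine fold the SRL into the pointer. A standalone sketch of that equivalence (illustration only, not part of the patch; the helper names are made up):

#include <cassert>
#include <cstdint>
#include <cstring>

// trunc-to-i16 of (load32 >> ShiftAmt), as matched by the combine above.
static uint16_t truncOfShiftedLoad(const uint8_t *P, unsigned ShiftAmt) {
  uint32_t Wide;
  std::memcpy(&Wide, P, sizeof(Wide)); // the original wide load
  return static_cast<uint16_t>(Wide >> ShiftAmt);
}

// The replacement: a narrow load at byte offset ShiftAmt / 8.
static uint16_t narrowedLoad(const uint8_t *P, unsigned ShiftAmt) {
  uint16_t Narrow;
  std::memcpy(&Narrow, P + ShiftAmt / 8, sizeof(Narrow));
  return Narrow;
}

int main() {
  const uint8_t Buf[4] = {0x11, 0x22, 0x33, 0x44};
  for (unsigned ShiftAmt : {0u, 16u})
    assert(truncOfShiftedLoad(Buf, ShiftAmt) == narrowedLoad(Buf, ShiftAmt));
  return 0;
}
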
+ // vbroadcast(vzload X) -> vbroadcast_load X
+ if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
+ MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
+ if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // vbroadcast(vector load X) -> vbroadcast_load
+ if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VZEXT_MOVL: {
+ SDValue N0 = N.getOperand(0);
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
+ auto *LN = cast<LoadSDNode>(N0);
+ if (SDValue VZLoad =
+ narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
+ // and can just use a VZEXT_LOAD.
+ // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *LN = cast<MemSDNode>(N0);
+ if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
+ // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
+ // if the upper bits of the i64 are zero.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getValueType() == MVT::i64) {
+ SDValue In = N0.getOperand(0);
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(In, Mask)) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
+ return DAG.getBitcast(VT, Movl);
+ }
+ }
+
+ // Load a scalar integer constant directly to XMM instead of transferring an
+ // immediate value from GPR.
+ // vzext_movl (scalar_to_vector C) --> load [C,0...]
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ // Create a vector constant - scalar constant followed by zeros.
+ EVT ScalarVT = N0.getOperand(0).getValueType();
+ Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+ unsigned NumElts = VT.getVectorNumElements();
+ Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+ SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+ ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+ // Load the vector constant from constant pool.
+ MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ }
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -34685,6 +36257,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::VPERM2X128: {
+ // If both 128-bit values were inserted into high halves of 256-bit values,
+ // the shuffle can be reduced to a concatenation of subvectors:
+ // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
+ // Note: We are only looking for the exact high/high shuffle mask because we
+ // expect to fold other similar patterns before creating this opcode.
+ SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (!(Imm == 0x31 &&
+ Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins0.getValueType() == Ins1.getValueType()))
+ return SDValue();
+
+ SDValue X = Ins0.getOperand(1);
+ SDValue Y = Ins1.getOperand(1);
+ unsigned C1 = Ins0.getConstantOperandVal(2);
+ unsigned C2 = Ins1.getConstantOperandVal(2);
+ MVT SrcVT = X.getSimpleValueType();
+ unsigned SrcElts = SrcVT.getVectorNumElements();
+ if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
+ C1 != SrcElts || C2 != SrcElts)
+ return SDValue();
+
+ return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ Ins1.getValueType(), X, Y));
+ }
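
Concretely, with assumed v4i64 types: SrcElts is 2, so an insert index of 2 places the 128-bit value into the upper half of the 256-bit vector, and the 0x31 immediate selects the upper half of each source. The shuffle therefore reads back exactly X and Y, which is why it can be rewritten as concat_vectors X, Y.
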
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -34724,8 +36324,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
- SDValue Op2 = N.getOperand(2);
- unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned InsertPSMask = N.getConstantOperandVal(2);
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
@@ -34865,9 +36464,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
- V.hasOneUse()) {
+ V.hasOneUse() && V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
- if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
@@ -35266,7 +36865,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
// Attempt to combine into a vector load/broadcast.
- if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
+ Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
@@ -35299,79 +36899,100 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
- // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
- // in the upper 64 bits.
- // TODO: Can we generalize this using computeKnownBits.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
- (VT == MVT::v2f64 || VT == MVT::v2i64) &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
- N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
- SDValue In = N->getOperand(0).getOperand(0);
- switch (In.getOpcode()) {
- default:
- break;
- case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
- case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
- case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
- case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
- case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
- case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
- case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
- if (In.getOperand(0).getValueType() == MVT::v2f64 ||
- In.getOperand(0).getValueType() == MVT::v2i64)
- return N->getOperand(0); // return the bitcast
- break;
- case X86ISD::STRICT_CVTTP2SI:
- case X86ISD::STRICT_CVTTP2UI:
- case X86ISD::STRICT_CVTSI2P:
- case X86ISD::STRICT_CVTUI2P:
- case X86ISD::STRICT_VFPROUND:
- if (In.getOperand(1).getValueType() == MVT::v2f64 ||
- In.getOperand(1).getValueType() == MVT::v2i64)
- return N->getOperand(0);
- break;
- }
- }
-
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
- N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
- N->getOperand(0).hasOneUse() &&
- N->getOperand(0).getOperand(0).isUndef() &&
- isNullConstant(N->getOperand(0).getOperand(2))) {
- SDValue In = N->getOperand(0).getOperand(1);
- SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
- getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
- Movl, N->getOperand(0).getOperand(2));
- }
-
- // If this a vzmovl of a full vector load, replace it with a vzload, unless
- // the load is volatile.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
- ISD::isNormalLoad(N->getOperand(0).getNode())) {
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- if (LN->isSimple()) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- VT.getVectorElementType(),
- LN->getPointerInfo(),
- LN->getAlignment(),
- MachineMemOperand::MOLoad);
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
- return VZLoad;
+ N->getOperand(0).hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT =
+ MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
+ In.getValueSizeInBits() / VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, V.getOperand(2));
}
}
return SDValue();
}
+// Simplify variable target shuffle masks based on the demanded elements.
+// TODO: Handle DemandedBits in mask indices as well?
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
+ SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
+ TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
+ // If we're demanding all elements don't bother trying to simplify the mask.
+ unsigned NumElts = DemandedElts.getBitWidth();
+ if (DemandedElts.isAllOnesValue())
+ return false;
+
+ SDValue Mask = Op.getOperand(MaskIndex);
+ if (!Mask.hasOneUse())
+ return false;
+
+ // Attempt to generically simplify the variable shuffle mask.
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to extract+simplify a (constant pool load) shuffle mask.
+ // TODO: Support other types from getTargetShuffleMaskIndices?
+ SDValue BC = peekThroughOneUseBitcasts(Mask);
+ EVT BCVT = BC.getValueType();
+ auto *Load = dyn_cast<LoadSDNode>(BC);
+ if (!Load)
+ return false;
+
+ const Constant *C = getTargetConstantFromNode(Load);
+ if (!C)
+ return false;
+
+ Type *CTy = C->getType();
+ if (!CTy->isVectorTy() ||
+ CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
+ return false;
+
+ // Handle scaling for i64 elements on 32-bit targets.
+ unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
+ if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
+ return false;
+ unsigned Scale = NumCstElts / NumElts;
+
+ // Simplify mask if we have an undemanded element that is not undef.
+ bool Simplified = false;
+ SmallVector<Constant *, 32> ConstVecOps;
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
+ ConstVecOps.push_back(UndefValue::get(Elt->getType()));
+ Simplified = true;
+ continue;
+ }
+ ConstVecOps.push_back(Elt);
+ }
+ if (!Simplified)
+ return false;
+
+ // Generate new constant pool entry + legalize immediately for the load.
+ SDLoc DL(Op);
+ SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
+ SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
+ SDValue NewMask = TLO.DAG.getLoad(
+ BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
+ MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
+ Load->getAlign());
+ return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
+}
+
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
@@ -35541,12 +37162,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
- APInt DemandedSrcBits =
- APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
- SDValue NewN0 = SimplifyMultipleUseDemandedBits(
- N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
- SDValue NewN1 = SimplifyMultipleUseDemandedBits(
- N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
+ TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
+ TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -35608,6 +37227,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownUndef = LHSUndef & RHSUndef;
break;
}
+ case X86ISD::VZEXT_MOVL: {
+ // If upper demanded elements are already zero then we have nothing to do.
+ SDValue Src = Op.getOperand(0);
+ APInt DemandedUpperElts = DemandedElts;
+ DemandedUpperElts.clearLowBits(1);
+ if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
+ return TLO.CombineTo(Op, Src);
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -35625,36 +37253,32 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
+ // Aggressively peek through src to get at the demanded elt.
+ // TODO - we should do this for all target/faux shuffles ops.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, SrcElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
break;
}
- case X86ISD::VPERMV: {
- SDValue Mask = Op.getOperand(0);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
- case X86ISD::VPERMILPV: {
- SDValue Mask = Op.getOperand(1);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMILPV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::VPPERM:
- case X86ISD::VPERMIL2: {
- SDValue Mask = Op.getOperand(2);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMIL2:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
+ Depth))
return true;
break;
}
- }
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
@@ -35669,18 +37293,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
- // Zero upper elements.
- case X86ISD::VZEXT_MOVL: {
- SDLoc DL(Op);
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp =
- TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
- SDValue UndefVec = TLO.DAG.getUNDEF(VT);
- SDValue Insert =
- insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
- return TLO.CombineTo(Op, Insert);
- }
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
@@ -35733,10 +37345,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
- // Target Shuffles.
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL:
+ // Target unary shuffles by immediate:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ case X86ISD::VPERMILPI:
+ // (Non-Lane Crossing) Target Shuffles.
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
+ case X86ISD::BLENDI:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
@@ -35746,14 +37368,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ EVT SrcVT = SrcOp.getValueType();
+ assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
+ "Unsupported vector size");
+ Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
+ ExtSizeInBits)
+ : SrcOp);
+ }
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue Ext1 =
- extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
@@ -35850,6 +37478,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
+ case X86ISD::VTRUNC: {
+ KnownBits KnownOp;
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Simplify the input, using demanded bit information.
+ APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
+ APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+ return true;
+ break;
+ }
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
@@ -35906,6 +37546,14 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
}
+ // If we are only demanding sign bits then we can use the shift source directly.
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
+ unsigned UpperDemandedBits =
+ BitWidth - OriginalDemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return TLO.CombineTo(Op, Op0);
+
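
A concrete instance of the sign-bit shortcut above, with assumed values: if Op0 is an i16 with 12 known sign bits, ShAmt is 4 and only the top 8 bits are demanded, those bits are sign-bit copies both before and after the shift, so Op0 can be used directly. A standalone check (illustration only, outside the patch):

#include <cassert>
#include <cstdint>

int main() {
  // 0xFFF5 has 12 sign bits as a signed 16-bit value (bits 15..4 all match
  // the sign bit), so NumSignBits(12) - ShAmt(4) >= UpperDemandedBits(8).
  uint16_t X = 0xFFF5;
  uint16_t Shifted = static_cast<uint16_t>(X << 4);
  assert((X >> 8) == (Shifted >> 8)); // top 8 demanded bits are unchanged
  return 0;
}
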
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
@@ -36019,7 +37667,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
- Known = KnownVec.zext(BitWidth, true);
+ Known = KnownVec.zext(BitWidth);
return false;
}
break;
@@ -36072,6 +37720,17 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
+ SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
+ }
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
@@ -36104,16 +37763,51 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
- if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
- KnownSrc, TLO, Depth + 1))
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid multi-use ops if we don't need anything from it.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Only bottom 16-bits of the control bits are required.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1) {
+ SDLoc DL(Op);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
+ TLO.DAG.getConstant(MaskedVal1, DL, VT)));
+ }
+ }
+
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ break;
+ }
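
For context, the BEXTR control operand packs the start bit in bits [7:0] and the extract length in bits [15:8]; the remaining control bits are ignored, and a zero length always yields zero, which is what the two simplifications above rely on. A scalar sketch of those semantics (illustration only, not from the patch):

#include <cassert>
#include <cstdint>

// Scalar model of 64-bit BEXTR: start = Ctrl[7:0], length = Ctrl[15:8],
// higher control bits ignored, zero length produces zero.
static uint64_t bextr64(uint64_t Src, uint64_t Ctrl) {
  unsigned Start = Ctrl & 0xFF;
  unsigned Len = (Ctrl >> 8) & 0xFF;
  if (Len == 0 || Start >= 64)
    return 0;
  uint64_t Shifted = Src >> Start;
  return Len >= 64 ? Shifted : Shifted & ((uint64_t(1) << Len) - 1);
}

int main() {
  assert(bextr64(0xABCD1234, 0x0804) == 0x23);     // 8 bits starting at bit 4
  assert(bextr64(0xABCD1234, 0xFFFF00000804ULL) == // upper control bits are
         bextr64(0xABCD1234, 0x0804));             // simply ignored
  assert(bextr64(0xABCD1234, 0x0004) == 0);        // zero length -> zero
  return 0;
}
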
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -36137,8 +37831,26 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
- break;
+ break;
+ }
+ case X86ISD::VSHLI: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return Op0;
+ break;
}
+ case X86ISD::VSRAI:
+ // iff we only need the sign bit then we can use the source directly.
+ // TODO: generalize where we only demand extended signbits.
+ if (DemandedBits.isSignMask())
+ return Op.getOperand(0);
+ break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
@@ -36172,13 +37884,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
continue;
- int Op = M / NumElts;
- int Index = M % NumElts;
- if (M < 0 || Index != i) {
+ int OpIdx = M / NumElts;
+ int EltIdx = M % NumElts;
+ if (M < 0 || EltIdx != i) {
IdentityOp.clearAllBits();
break;
}
- IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+ IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
if (IdentityOp == 0)
break;
}
@@ -36209,6 +37921,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}
+// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
+static unsigned getAltBitOpcode(unsigned Opcode) {
+ switch(Opcode) {
+ case ISD::AND: return X86ISD::FAND;
+ case ISD::OR: return X86ISD::FOR;
+ case ISD::XOR: return X86ISD::FXOR;
+ case X86ISD::ANDNP: return X86ISD::FANDN;
+ }
+ llvm_unreachable("Unknown bitwise opcode");
+}
+
+// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
+static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
+ const SDLoc &DL) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT != MVT::v4i1)
+ return SDValue();
+
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
+ SDValue Op0 = Src.getOperand(0);
+ if (ISD::isNormalLoad(Op0.getNode()))
+ return DAG.getBitcast(MVT::v4f32, Op0);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getValueType() == MVT::v4f32)
+ return Op0.getOperand(0);
+ }
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
+ SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
+ if (Op0 && Op1)
+ return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
+ Op1);
+ break;
+ }
+ }
+ return SDValue();
+}
+
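
The v4i1 shape this helper matches typically comes from collecting the sign bits of four 32-bit values, which on an SSE1-only target can only be done cheaply through the float domain via MOVMSKPS. A rough source-level illustration of that pattern (an assumption about the originating code, not part of the patch):

#include <cstdint>

// Collect the sign bits of four 32-bit integers into a 4-bit mask. Vectorized
// for a target with SSE1 but not SSE2, this is the (bitcast (v4i1 setlt X, 0))
// shape that the combine lowers through MOVMSK on a v4f32 bitcast.
int signMask4(const int32_t V[4]) {
  int Mask = 0;
  for (int I = 0; I != 4; ++I)
    Mask |= (V[I] < 0) << I;
  return Mask;
}
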
// Helper to push sign extension of vXi1 SETCC result through bitops.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
@@ -36239,18 +37996,40 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
+ if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, V));
+ return DAG.getZExtOrTrunc(V, DL, VT);
+ }
+ }
+
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
- (Src.getOperand(0).getValueType() == MVT::v16i8 ||
- Src.getOperand(0).getValueType() == MVT::v32i8 ||
- Src.getOperand(0).getValueType() == MVT::v64i8);
+ bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
+
+ // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
+ // directly with vpmovmskb/vmovmskps/vmovmskpd.
+ if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+ if (CmpVT.getSizeInBits() <= 256 &&
+ (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
+ PreferMovMsk = true;
+ }
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
- if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
+ if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
@@ -36306,7 +38085,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
case MVT::v64i1:
// If we have AVX512F, but not AVX512BW and the input is truncated from
// v64i8 checked earlier. Then split the input and make two pmovmskbs.
- if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
+ if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasBWI())
+ return SDValue();
+ SExtVT = MVT::v64i8;
+ break;
+ }
+ // Split if this is a <64 x i8> comparison result.
+ if (checkBitcastSrcVectorSize(Src, 512)) {
SExtVT = MVT::v64i8;
break;
}
@@ -36476,6 +38262,74 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
return Ops[0];
}
+// Recursive function that attempts to find if a bool vector node was originally
+// a vector/float/double that got truncated/extended/bitcast to/from a scalar
+// integer. If so, replace the scalar ops with bool vector equivalents back down
+// the chain.
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Opc = V.getOpcode();
+ switch (Opc) {
+ case ISD::BITCAST: {
+ // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() || SrcVT.isFloatingPoint())
+ return DAG.getBitcast(VT, Src);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // If we find a suitable source, a truncated scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
+ DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ // If we find a suitable source, an extended scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Src.getScalarValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
+ : DAG.getConstant(0, DL, VT),
+ N0, DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::OR: {
+ // If we find suitable sources, we can just move an OR to the vector domain.
+ SDValue Src0 = V.getOperand(0);
+ SDValue Src1 = V.getOperand(1);
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ return DAG.getNode(Opc, DL, VT, N0, N1);
+ break;
+ }
+ case ISD::SHL: {
+ // If we find a suitable source, a SHL becomes a KSHIFTL.
+ SDValue Src0 = V.getOperand(0);
+ if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ return DAG.getNode(
+ X86ISD::KSHIFTL, DL, VT, N0,
+ DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
+ break;
+ }
+ }
+ return SDValue();
+}
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -36494,24 +38348,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
- // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type
- // legalization destroys the v4i32 type.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
- VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
- N0.getOperand(0).getValueType() == MVT::v4i32 &&
- ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
- cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
- SDValue N00 = N0.getOperand(0);
- // Only do this if we can avoid scalarizing the input.
- if (ISD::isNormalLoad(N00.getNode()) ||
- (N00.getOpcode() == ISD::BITCAST &&
- N00.getOperand(0).getValueType() == MVT::v4f32)) {
- SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
- DAG.getBitcast(MVT::v4f32, N00));
- return DAG.getZExtOrTrunc(V, dl, VT);
- }
- }
-
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -36553,6 +38389,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
+ } else {
+ // If we're bitcasting from iX to vXi1, see if the integer originally
+ // began as a vXi1 and whether we can remove the bitcast entirely.
+ if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
+ SrcVT.isScalarInteger() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ if (SDValue V =
+ combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
+ return V;
+ }
}
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
@@ -36567,19 +38413,30 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
- // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
- // determines // the number of bits loaded. Remaining bits are zero.
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
+ // and the vbroadcast_load are both integer or both fp. In some cases this
+ // will remove the bitcast entirely.
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
- VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
- VT.getVectorElementType(),
- BCast->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
- return ResNode;
+ unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
+ unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
+ // Don't swap i8/i16 since we don't have fp types of that size.
+ if (MemSize >= 32) {
+ MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
+ : MVT::getIntegerVT(MemSize);
+ MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
+ : MVT::getIntegerVT(SrcVTSize);
+ LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
+
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemVT, BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return DAG.getBitcast(VT, ResNode);
+ }
}
// Since MMX types are special and don't usually play with other vector types,
@@ -36666,6 +38523,47 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
+ // Turn it into a sign bit compare that produces a k-register. This avoids
+ // a trip through a GPR.
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isPowerOf2_32(VT.getVectorNumElements())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue Src = N0;
+
+ // Peek through truncate.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ Src = N0.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+ SDValue MovmskIn = Src.getOperand(0);
+ MVT MovmskVT = MovmskIn.getSimpleValueType();
+ unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+ // We allow extra bits of the movmsk to be used since they are known zero.
+ // We can't convert a VPMOVMSKB without avx512bw.
+ if (MovMskElts <= NumElts &&
+ (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+ EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+ MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+ SDLoc dl(N);
+ MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+ SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+ DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+ if (EVT(CmpVT) == VT)
+ return Cmp;
+
+ // Pad with zeroes up to original VT to replace the zeroes that were
+ // being used from the MOVMSK.
+ unsigned NumConcats = NumElts / MovMskElts;
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+ Ops[0] = Cmp;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+ }
+ }
+ }
+
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
@@ -36790,12 +38688,9 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
- unsigned NumElts = SrcVT.getVectorNumElements();
- unsigned NumSubElts = NumElts / 2;
- SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
- unsigned SubSizeInBits = SrcVT.getSizeInBits();
- SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
- SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
+ SrcVT = Lo.getValueType();
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
@@ -36882,6 +38777,25 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
+ // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
+ // PCMPEQQ (SSE41+), use PCMPEQD instead.
+ if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
+ Match.getOpcode() == ISD::SETCC &&
+ ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
+ ISD::CondCode::SETEQ) {
+ SDValue Vec = Match.getOperand(0);
+ if (Vec.getValueType().getScalarType() == MVT::i64 &&
+ (2 * NumElts) <= MaxElts) {
+ NumElts *= 2;
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Match = DAG.getSetCC(
+ DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
+ DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
+ }
+ }
+
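
The widening above is safe for the all_of case because a 64-bit lane is equal to zero exactly when both of its 32-bit halves are, so AND-reducing twice as many 32-bit equal-to-zero lanes gives the same result while avoiding the missing PCMPEQQ. A trivial scalar check of that equivalence (standalone illustration):

#include <cassert>
#include <cstdint>

static bool zero64(uint64_t X) { return X == 0; }
static bool zero32Halves(uint64_t X) {
  return uint32_t(X) == 0 && uint32_t(X >> 32) == 0;
}

int main() {
  for (uint64_t X : {0ull, 1ull, 0x100000000ull, ~0ull})
    assert(zero64(X) == zero32Halves(X));
  return 0;
}
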
// Use combineBitcastvxi1 to create the MOVMSK.
while (NumElts > MaxElts) {
SDValue Lo, Hi;
@@ -36896,10 +38810,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
- // Bail with AVX512VL (which uses predicate registers).
- if (Subtarget.hasVLX())
- return SDValue();
-
+ // FIXME: Better handling of k-registers or 512-bit vectors?
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
@@ -36976,21 +38887,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is any integer type above i16.
- EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
+ EVT ExtractVT = Extract->getValueType(0);
+ // Verify the type we're extracting is either i32 or i64.
+ // FIXME: Could support other types, but this is what we have coverage for.
+ if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
return SDValue();
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (RegSize / VT.getVectorNumElements() < 8)
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Match shuffle + add pyramid.
@@ -37006,8 +38910,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// (extends the sign bit which is zero).
// So it is correct to skip the sign/zero extend instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
- Root.getOpcode() == ISD::ZERO_EXTEND ||
- Root.getOpcode() == ISD::ANY_EXTEND))
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be a select that is the root of an
@@ -37027,7 +38931,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
unsigned Stages = Log2_32(VT.getVectorNumElements());
- MVT SadVT = SAD.getSimpleValueType();
+ EVT SadVT = SAD.getValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
@@ -37042,12 +38946,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- MVT Type = Extract->getSimpleValueType(0);
- unsigned TypeSizeInBits = Type.getSizeInBits();
- // Return the lowest TypeSizeInBits bits.
- MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
+ unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
+ // Return the lowest ExtractSizeInBits bits.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
+ SadVT.getSizeInBits() / ExtractSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
Extract->getOperand(1));
}
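
The SAD pyramid above bottoms out in PSADBW, which for each 8-byte group sums the absolute differences against the second operand and zero-extends the result into a 64-bit lane; with an all-zero second operand that is just a per-group byte sum. A scalar model of one lane (a sketch of the semantics, not of the lowering):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// One 64-bit lane of PSADBW: sum of |A[i] - B[i]| over eight byte pairs,
// zero-extended to 64 bits. With B == 0 this reduces eight bytes to their sum.
static uint64_t psadbwLane(const uint8_t A[8], const uint8_t B[8]) {
  uint64_t Sum = 0;
  for (int I = 0; I != 8; ++I)
    Sum += std::abs(int(A[I]) - int(B[I]));
  return Sum;
}

int main() {
  const uint8_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const uint8_t Zero[8] = {};
  assert(psadbwLane(A, Zero) == 36); // plain byte sum when B is zero
  return 0;
}
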
@@ -37066,19 +38970,34 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned SrcEltBits = SrcSVT.getSizeInBits();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ const APInt &IdxC = N->getConstantOperandAPInt(1);
+ if (IdxC.uge(NumSrcElts))
+ return SDValue();
+
SDValue SrcBC = peekThroughBitcasts(Src);
- // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
+ // Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
- if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, SrcOp);
+ EVT SrcOpVT = SrcOp.getValueType();
+ if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
+ (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
+ unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
+ unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
+ // TODO: support non-zero offsets.
+ if (Offset == 0) {
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
+ return SrcOp;
+ }
+ }
}
// If we're extracting a single element from a broadcast load and there are
@@ -37087,22 +39006,43 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
- VT.getSizeInBits() == SrcBCWidth) {
+ VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getPointerInfo(),
- MemIntr->getAlignment(),
+ MemIntr->getOriginalAlign(),
MemIntr->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
}
}
+ // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
+ // TODO: Move to DAGCombine?
+ if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
+ SrcBC.getValueType().isInteger() &&
+ (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
+ SrcBC.getScalarValueSizeInBits() ==
+ SrcBC.getOperand(0).getValueSizeInBits()) {
+ unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
+ if (IdxC.ult(Scale)) {
+ unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
+ SDValue Scl = SrcBC.getOperand(0);
+ EVT SclVT = Scl.getValueType();
+ if (Offset) {
+ Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
+ DAG.getShiftAmountConstant(Offset, SclVT, dl));
+ }
+ Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
+ Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
+ return Scl;
+ }
+ }
+
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
- if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
- isNullConstant(Idx)) {
+ if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
@@ -37114,12 +39054,18 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
+ return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return SDValue();
+
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
- scaleShuffleMask<int>(Scale, Mask, ScaledMask);
+ narrowShuffleMaskElts(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
@@ -37144,7 +39090,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (Mask.size() != NumSrcElts)
return SDValue();
- int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ int SrcIdx = Mask[IdxC.getZExtValue()];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
@@ -37171,8 +39117,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
- assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
- "Unexpected extraction type");
+ assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
@@ -37342,12 +39287,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
- unsigned HalfSize = VecVT.getSizeInBits() / 2;
- unsigned HalfElts = VecVT.getVectorNumElements() / 2;
- SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
- SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
- Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
- VecVT = Rdx.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ VecVT = Lo.getValueType();
+ Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
}
assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
@@ -37362,8 +39305,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
}
// Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
- if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget))
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
@@ -37495,11 +39437,21 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
+ //
+ // Note that we only combine extracts on the *same* result number, i.e.
+ // t0 = merge_values a0, a1, a2, a3
+ // i1 = extract_vector_elt t0, Constant:i64<2>
+ // i1 = extract_vector_elt t0, Constant:i64<3>
+ // but not
+ // i1 = extract_vector_elt t0:1, Constant:i64<2>
+ // since the latter would need its own MOVMSK.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
- auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ unsigned ResNo = InputVector.getResNo();
+ auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
@@ -37548,8 +39500,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- // Check if the first operand is all zeros and Cond type is vXi1.
- // This situation only applies to avx512.
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
@@ -37564,14 +39514,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, DL, VT);
}
- if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
- Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
@@ -37796,12 +39738,13 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
return true;
};
+ APInt DemandedBits(APInt::getSignMask(BitWidth));
+
if (OnlyUsedAsSelectCond(Cond)) {
- APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
@@ -37823,15 +39766,9 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
}
// Otherwise we can still at least try to simplify multiple use bits.
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
- DemandedElts, DAG, 0))
- return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- V, N->getOperand(1), N->getOperand(2));
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
+ N->getOperand(1), N->getOperand(2));
return SDValue();
}
@@ -38315,6 +40252,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // If this is an avx512 target we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
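+ // e.g. vselect M, zero, X --> vselect (not M), X, zero, which maps directly
+ // onto the zero-masked instruction forms.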
+ if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
+ }
+
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
@@ -38334,12 +40284,86 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- // Custom action for SELECT MMX
- if (VT == MVT::x86mmx) {
- LHS = DAG.getBitcast(MVT::i64, LHS);
- RHS = DAG.getBitcast(MVT::i64, RHS);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
- return DAG.getBitcast(VT, newSelect);
+ // Try to optimize vXi1 selects if both operands are either all constants or
+ // bitcasts from scalar integer type. In that case we can convert the operands
+ // to integer and use an integer select which will be converted to a CMOV.
+ // We need to take a little bit of care to avoid creating an i64 type after
+ // type legalization.
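+ // e.g. select %c, (v8i1 bitcast (i8 %x)), (v8i1 bitcast (i8 %y))
+ //        --> bitcast (select %c, i8 %x, i8 %y) to v8i1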
+ if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+ bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
+ bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
+
+ if ((LHSIsConst ||
+ (LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == IntVT)) &&
+ (RHSIsConst ||
+ (RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == IntVT))) {
+ if (LHSIsConst)
+ LHS = combinevXi1ConstantToInteger(LHS, DAG);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHSIsConst)
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
+ }
+
+ // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
+ // single bits, then invert the predicate and swap the select operands.
+ // This can lower using a vector shift bit-hack rather than mask and compare.
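+ // e.g. for i32 elements and C == 4: ((X & 4) == 0) ? L : R can become
+ // ((X << 29) < 0) ? R : L, moving bit 2 up into the sign bit.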
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ isNullOrNullSplat(Cond.getOperand(1)) &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ Cond.getOperand(0).getValueType() == VT) {
+ // The 'and' mask must be composed of power-of-2 constants.
+ SDValue And = Cond.getOperand(0);
+ auto *C = isConstOrConstSplat(And.getOperand(1));
+ if (C && C->getAPIntValue().isPowerOf2()) {
+ // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
+ SDValue NotCond =
+ DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
+ return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
+ }
+
+ // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
+ // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
+ // 16-bit lacks a proper blendv.
+ unsigned EltBitWidth = VT.getScalarSizeInBits();
+ bool CanShiftBlend =
+ TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+ (Subtarget.hasAVX2() && EltBitWidth == 64) ||
+ (Subtarget.hasXOP()));
+ if (CanShiftBlend &&
+ ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
+ return C->getAPIntValue().isPowerOf2();
+ })) {
+ // Create a left-shift constant to get the mask bits over to the sign-bit.
+ SDValue Mask = And.getOperand(1);
+ SmallVector<int, 32> ShlVals;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
+ ShlVals.push_back(EltBitWidth - 1 -
+ MaskVal->getAPIntValue().exactLogBase2());
+ }
+ // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
+ SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
+ SDValue NewCond =
+ DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
+ return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
+ }
}
return SDValue();
@@ -38665,6 +40689,282 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
return SDValue();
}
+/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+ if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+ EFLAGS.getOpcode() != X86ISD::TESTP)
+ return SDValue();
+
+ // PTEST/TESTP sets EFLAGS as:
+ // TESTZ: ZF = (Op0 & Op1) == 0
+ // TESTC: CF = (~Op0 & Op1) == 0
+ // TESTNZC: ZF == 0 && CF == 0
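+ // Thus COND_E/COND_NE here test ZF (testz) and COND_B/COND_AE test CF
+ // (testc).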
+ EVT VT = EFLAGS.getValueType();
+ SDValue Op0 = EFLAGS.getOperand(0);
+ SDValue Op1 = EFLAGS.getOperand(1);
+ EVT OpVT = Op0.getValueType();
+
+ // TEST*(~X,Y) == TEST*(X,Y)
+ if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+ X86::CondCode InvCC;
+ switch (CC) {
+ case X86::COND_B:
+ // testc -> testz.
+ InvCC = X86::COND_E;
+ break;
+ case X86::COND_AE:
+ // !testc -> !testz.
+ InvCC = X86::COND_NE;
+ break;
+ case X86::COND_E:
+ // testz -> testc.
+ InvCC = X86::COND_B;
+ break;
+ case X86::COND_NE:
+ // !testz -> !testc.
+ InvCC = X86::COND_AE;
+ break;
+ case X86::COND_A:
+ case X86::COND_BE:
+ // testnzc -> testnzc (no change).
+ InvCC = CC;
+ break;
+ default:
+ InvCC = X86::COND_INVALID;
+ break;
+ }
+
+ if (InvCC != X86::COND_INVALID) {
+ CC = InvCC;
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp0), Op1);
+ }
+ }
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // TESTZ(X,~Y) == TESTC(Y,X)
+ if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp1), Op0);
+ }
+
+ if (Op0 == Op1) {
+ SDValue BC = peekThroughBitcasts(Op0);
+ EVT BCVT = BC.getValueType();
+ assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ "Unexpected vector type");
+
+ // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
+ if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
+ if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // If every element is an all-sign value, see if we can use MOVMSK to
+ // more efficiently extract the sign bits and compare that.
+ // TODO: Handle TESTC with comparison inversion.
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+ // MOVMSK combines to make sure its never worse than PTEST?
+ unsigned EltBits = BCVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(BC) == EltBits) {
+ assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ APInt SignMask = APInt::getSignMask(EltBits);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Res =
+ TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+ // For vXi16 cases we need to use pmovmskb and extract every other
+ // sign bit.
+ SDLoc DL(EFLAGS);
+ if (EltBits == 16) {
+ MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+ Res = DAG.getBitcast(MovmskVT, Res);
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ } else {
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ }
+ }
+
+ // TESTZ(-1,X) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+
+ // TESTZ(X,-1) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+ }
+
+ return SDValue();
+}
+
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Handle eq/ne against zero (any_of).
+ // Handle eq/ne against -1 (all_of).
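+ // i.e. any_of compares MOVMSK(X) against 0, all_of compares it against the
+ // all-lanes mask (1 << NumElts) - 1.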
+ if (!(CC == X86::COND_E || CC == X86::COND_NE))
+ return SDValue();
+ if (EFLAGS.getValueType() != MVT::i32)
+ return SDValue();
+ unsigned CmpOpcode = EFLAGS.getOpcode();
+ if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
+ return SDValue();
+ auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
+ if (!CmpConstant)
+ return SDValue();
+ const APInt &CmpVal = CmpConstant->getAPIntValue();
+
+ SDValue CmpOp = EFLAGS.getOperand(0);
+ unsigned CmpBits = CmpOp.getValueSizeInBits();
+ assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
+
+ // Peek through any truncate.
+ if (CmpOp.getOpcode() == ISD::TRUNCATE)
+ CmpOp = CmpOp.getOperand(0);
+
+ // Bail if we don't find a MOVMSK.
+ if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+ return SDValue();
+
+ SDValue Vec = CmpOp.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+ "Unexpected MOVMSK operand");
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned NumEltBits = VecVT.getScalarSizeInBits();
+
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
+ CmpVal.isMask(NumElts);
+ if (!IsAnyOf && !IsAllOf)
+ return SDValue();
+
+ // See if we can peek through to a vector with a wider element type, if the
+ // signbits extend down to all the sub-elements as well.
+ // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
+ // potential SimplifyDemandedBits/Elts cases.
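+ // e.g. if each i32 lane of a v4i32 source is known all-sign-bits, a MOVMSK
+ // of the v16i8 bitcast can instead be a MOVMSK of the v4i32 source.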
+ if (Vec.getOpcode() == ISD::BITCAST) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ MVT BCVT = BC.getSimpleValueType();
+ unsigned BCNumElts = BCVT.getVectorNumElements();
+ unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
+ if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
+ BCNumEltBits > NumEltBits &&
+ DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
+ SDLoc DL(EFLAGS);
+ unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
+ // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
+ if (IsAllOf && Subtarget.hasSSE41()) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ if (BC.getOpcode() == X86ISD::PCMPEQ &&
+ ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
+ MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ }
+
+ // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+ // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
+ // sign bits prior to the comparison with zero unless we know that
+ // the vXi16 splats the sign bit down to the lower i8 half.
+ // TODO: Handle all_of patterns.
+ if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+ SDValue VecOp0 = Vec.getOperand(0);
+ SDValue VecOp1 = Vec.getOperand(1);
+ bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+ bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+ // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+ if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+ if (!SignExt0) {
+ Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+ DAG.getConstant(0xAAAA, DL, MVT::i16));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i16));
+ }
+ // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
+ // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+ if (CmpBits == 16 && Subtarget.hasInt256() &&
+ VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+ VecOp0.getConstantOperandAPInt(1) == 0 &&
+ VecOp1.getConstantOperandAPInt(1) == 8 &&
+ (IsAnyOf || (SignExt0 && SignExt1))) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
+ if (!SignExt0 || !SignExt1) {
+ assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ if (NumElts == CmpBits &&
+ getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
+ ShuffleMask, DAG) &&
+ ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
+ ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
+ unsigned NumShuffleElts = ShuffleMask.size();
+ APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
+ for (int M : ShuffleMask) {
+ assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
+ DemandedElts.setBit(M);
+ }
+ if (DemandedElts.isAllOnesValue()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result =
+ DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ EFLAGS.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
@@ -38677,6 +40977,13 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
+
+ if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
@@ -38698,7 +41005,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
- if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+ if (!(FalseOp.getValueType() == MVT::f80 ||
+ (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
+ (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
+ !Subtarget.hasCMov() || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
@@ -39007,7 +41317,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
: ISD::SIGN_EXTEND,
DL, VT, MulLo);
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
@@ -39138,10 +41448,14 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
- // Also allow v2i32 if it will be widened.
+ // Make sure the type is legal or will be widened to a legal type.
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+
+ // Without BWI, we would need to split v32i16.
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -39358,6 +41672,64 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
return NewMul;
}
+// Try to form a MULHU or MULHS node by looking for
+// (srl (mul ext, ext), 16)
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
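+// e.g. (srl (mul (zext v8i16 X), (zext v8i16 Y)), 16) --> zext (mulhu X, Y).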
+static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+ SDLoc DL(N);
+
+ // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
+ // the multiply.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
+ return SDValue();
+
+ // Input type should be at least vXi32.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = ShiftOperand.getOperand(0);
+ SDValue RHS = ShiftOperand.getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ EVT MulVT = LHS.getValueType();
+ if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
+
+ ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, DL, VT, Mulh);
+}
+
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -39417,12 +41789,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
// into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
@@ -39471,11 +41847,15 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
@@ -39519,16 +41899,92 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected pack opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
+
+ // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
+ // truncation trees that help us avoid lane crossing shuffles.
+ // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getConstantOperandAPInt(1) == 0 &&
+ N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+ N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
+ N0.getOperand(0).getValueType().is256BitVector()) {
+ // TODO - support target/faux shuffles.
+ SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ // To keep the PACK LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a vXi64 width - we can probably relax this in the future.
+ SmallVector<int, 4> ShuffleMask;
+ if (SVN->getOperand(1).isUndef() &&
+ scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
+ Lo = DAG.getBitcast(N0.getValueType(), Lo);
+ Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(MVT::v4i32, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
+ // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
+ // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
+ if (VT.is256BitVector()) {
+ if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
+ if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
+ scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
+ SDValue Op00 = SVN0->getOperand(0);
+ SDValue Op01 = SVN0->getOperand(1);
+ SDValue Op10 = SVN1->getOperand(0);
+ SDValue Op11 = SVN1->getOperand(1);
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(MVT::v4i64, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected shift opcode");
+ "Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
@@ -39545,7 +42001,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
@@ -39592,6 +42047,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
+ // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
+ if (SDValue V = combineVectorPackWithShuffle(N, DAG))
+ return V;
+
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
@@ -39674,26 +42133,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
- else
- ShiftVal = NumBitsPerElt - 1;
+ ShiftVal = NumBitsPerElt - 1;
}
- // Shift N0 by zero -> N0.
+ // (shift X, 0) -> X
if (!ShiftVal)
return N0;
- // Shift zero -> zero.
+ // (shift 0, C) -> 0
if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ // N0 is all zeros or undef. We guarantee that the bits shifted into the
+ // result are all zeros, not undef.
return DAG.getConstant(0, SDLoc(N), VT);
- // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
- // clamped to (NumBitsPerElt - 1).
- if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
+ // (VSRAI -1, C) -> -1
+ if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
+ // N0 is all ones or undef. We guarantee that the bits shifted into the
+ // result are all ones, not undef.
+ return DAG.getConstant(-1, SDLoc(N), VT);
+
+ // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
+ if (Opcode == N0.getOpcode()) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
- if (NewShiftVal >= NumBitsPerElt)
+ if (NewShiftVal >= NumBitsPerElt) {
+ // Out of range logical bit shifts are guaranteed to be zero.
+ // Out of range arithmetic bit shifts splat the sign bit.
+ if (LogicalShift)
+ return DAG.getConstant(0, SDLoc(N), VT);
NewShiftVal = NumBitsPerElt - 1;
- return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
+ }
+ return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
@@ -39743,19 +42213,24 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
+ N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
"Unexpected vector insertion");
- unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBitsPerElt), DCI))
- return SDValue(N, 0);
+ if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
+ }
- // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
+ // Attempt to combine insertion patterns to a shuffle.
+ if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
return SDValue();
}
@@ -39778,7 +42253,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
- if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
@@ -39877,10 +42352,27 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (SDValue Not = IsNOT(N0, DAG)) {
+ auto GetNot = [&VT, &DAG](SDValue V) {
+ // Basic X = NOT(Y) detection.
+ if (SDValue Not = IsNOT(V, DAG))
+ return Not;
+ // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
+ if (V.getOpcode() == X86ISD::VBROADCAST) {
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+ if (SDValue Not = IsNOT(Src, DAG))
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
+ DAG.getBitcast(SrcVT, Not));
+ }
+ return SDValue();
+ };
+
+ if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
- } else if (SDValue Not = IsNOT(N1, DAG)) {
+ } else if (SDValue Not = GetNot(N1)) {
X = Not;
Y = N0;
} else
@@ -39891,6 +42383,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+// or (and (truncate x, truncate y)),
+// (xor (truncate z, build_vector (constants)))
+// Given a target type \p VT, we generate
+// or (and x, y), (xor z, zext(build_vector (constants)))
+// given x, y and z are of type \p VT. We can do so, if operands are either
+// truncates from VT types, the second operand is a vector of constants or can
+// be recursively promoted.
+static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
+ unsigned Depth) {
+ // Limit recursion to avoid excessive compile times.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
+ N->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+ return SDValue();
+
+ if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+ N0 = NN0;
+ else {
+ // The Left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ if (N0.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ N0 = N0.getOperand(0);
+ }
+
+ if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+ N1 = NN1;
+ else {
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
+ if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
+ return SDValue();
+
+ if (RHSTrunc)
+ N1 = N1.getOperand(0);
+ else
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+ }
+
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+}
+
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
@@ -39902,6 +42453,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
+ SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
@@ -39909,57 +42461,33 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
- if (Narrow->getOpcode() != ISD::XOR &&
- Narrow->getOpcode() != ISD::AND &&
- Narrow->getOpcode() != ISD::OR)
- return SDValue();
-
- SDValue N0 = Narrow->getOperand(0);
- SDValue N1 = Narrow->getOperand(1);
- SDLoc DL(Narrow);
-
- // The Left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
- return SDValue();
-
- // The right side has to be a 'trunc' or a constant vector.
- bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
- N1.getOperand(0).getValueType() == VT;
- if (!RHSTrunc &&
- !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
- return SDValue();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
- return SDValue();
-
- // Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0.getOperand(0);
- if (RHSTrunc)
- N1 = N1.getOperand(0);
- else
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
-
// Generate the wide operation.
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
- unsigned Opcode = N->getOpcode();
- switch (Opcode) {
+ SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ if (!Op)
+ return SDValue();
+ switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
- return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
+static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
+ unsigned FPOpcode;
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ }
+ return FPOpcode;
+}
+
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
@@ -39984,18 +42512,45 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
- unsigned FPOpcode;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected input node for FP logic conversion");
- case ISD::AND: FPOpcode = X86ISD::FAND; break;
- case ISD::OR: FPOpcode = X86ISD::FOR; break;
- case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
- }
-
+ unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
+// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
+// to reduce XMM->GPR traffic.
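+// e.g. and (movmsk X), (movmsk Y) --> movmsk (and X, Y), keeping the logic in
+// the vector domain with a single XMM->GPR transfer.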
+static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Both operands must be single use MOVMSK.
+ if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
+ N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue Vec0 = N0.getOperand(0);
+ SDValue Vec1 = N1.getOperand(0);
+ EVT VecVT0 = Vec0.getValueType();
+ EVT VecVT1 = Vec1.getValueType();
+
+ // Both MOVMSK operands must be from vectors of the same size and same element
+ // size, but it's OK for an fp/int diff.
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
+ VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned VecOpc =
+ VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
+ SDValue Result =
+ DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+}
+
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
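/// e.g. and (pcmpeq X, Y), <1,1,1,1> --> srl (pcmpeq X, Y), EltBits - 1.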
@@ -40318,7 +42873,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40328,9 +42884,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getAllOnesValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
}
}
}
@@ -40338,6 +42896,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40446,6 +43007,16 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
}
SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
@@ -40529,6 +43100,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasSSE41())
return SDValue();
+ // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
+ if (Subtarget.hasVLX())
+ return SDValue();
+
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
@@ -40645,139 +43220,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
- !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
- return SDValue();
-
- // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.shouldOptForSize();
- unsigned Bits = VT.getScalarSizeInBits();
-
- // SHLD/SHRD instructions have lower register pressure, but on some
- // platforms they have higher latency than the equivalent
- // series of shifts/or that would otherwise be generated.
- // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
- // have higher latencies and we are not optimizing for size.
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
-
- if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
- std::swap(N0, N1);
- if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
- return SDValue();
- if (!N0.hasOneUse() || !N1.hasOneUse())
- return SDValue();
-
- EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
- SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != ShiftVT)
- return SDValue();
- SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != ShiftVT)
- return SDValue();
-
- // Peek through any modulo shift masks.
- SDValue ShMsk0;
- if (ShAmt0.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk0 = ShAmt0;
- ShAmt0 = ShAmt0.getOperand(0);
- }
- SDValue ShMsk1;
- if (ShAmt1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1;
- ShAmt1 = ShAmt1.getOperand(0);
- }
-
- if (ShAmt0.getOpcode() == ISD::TRUNCATE)
- ShAmt0 = ShAmt0.getOperand(0);
- if (ShAmt1.getOpcode() == ISD::TRUNCATE)
- ShAmt1 = ShAmt1.getOperand(0);
-
- SDLoc DL(N);
- unsigned Opc = ISD::FSHL;
- SDValue Op0 = N0.getOperand(0);
- SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
- Opc = ISD::FSHR;
- std::swap(Op0, Op1);
- std::swap(ShAmt0, ShAmt1);
- std::swap(ShMsk0, ShMsk1);
- }
-
- auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
- SDValue Amt) {
- if (Opc == ISD::FSHR)
- std::swap(Op0, Op1);
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
- };
-
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
- if (ShAmt1.getOpcode() == ISD::SUB) {
- SDValue Sum = ShAmt1.getOperand(0);
- if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
- SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
- if (ShAmt1Op1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
- ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1Op1;
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- }
- if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- if ((SumC->getAPIntValue() == Bits ||
- (SumC->getAPIntValue() == 0 && ShMsk1)) &&
- ShAmt1Op1 == ShAmt0)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- }
- } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
- auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
- if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- } else if (ShAmt1.getOpcode() == ISD::XOR) {
- SDValue Mask = ShAmt1.getOperand(1);
- if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
- SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
- if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op0 = ShAmt1Op0.getOperand(0);
- if (MaskC->getSExtValue() == (Bits - 1) &&
- (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
- if (Op1.getOpcode() == InnerShift &&
- isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1).isOneValue()) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
- if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
- Op1.getOperand(0) == Op1.getOperand(1)) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- }
- }
- }
-
- return SDValue();
-}
-
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -40797,7 +43239,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40807,13 +43250,19 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
+ SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
}
}
}
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40829,8 +43278,33 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
- if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
- return R;
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+ // KUNPCK requires 16+ bool vector elements.
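+ // e.g. for v32i1 this concatenates the two v16i1 halves, which lowers to a
+ // single KUNPCKWD.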
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -41179,18 +43653,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
- BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV || !BV->isConstant())
- return false;
- for (SDValue Op : V->ops()) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C)
- return false;
- const APInt &Val = C->getAPIntValue();
- if (Val.ult(Min) || Val.ugt(Max))
- return false;
- }
- return true;
+ return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
+ return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
+ });
};
// Check if each element of the vector is right-shifted by one.
@@ -41291,10 +43756,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
+ Ld->getAlignment() >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
@@ -41302,17 +43767,18 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
if (NumElems < 2)
return SDValue();
- unsigned HalfAlign = 16;
+ unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
- SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
- Alignment, Ld->getMemOperand()->getFlags());
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
- Ld->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Ld->getPointerInfo().getWithOffset(HalfOffset),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
@@ -41329,13 +43795,28 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Alignment,
+ Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a load.
+ unsigned AddrSpace = Ld->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
+ return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
+ }
+
return SDValue();
}
@@ -41482,7 +43963,7 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ auto *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
@@ -41491,12 +43972,33 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
+
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
+ SDValue Mask = Mld->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ EVT VT = Mld->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedLoad(
+ VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
+ NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
+ Mld->getAddressingMode(), Mld->getExtensionType());
+ }
+
return SDValue();
}
@@ -41548,9 +44050,18 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
+ Mst->getBasePtr(), Mst->getOffset(), NewMask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode());
}
SDValue Value = Mst->getValue();
@@ -41572,7 +44083,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
- unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41585,7 +44095,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41596,7 +44106,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
@@ -41607,7 +44118,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41616,7 +44127,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
// If its a v64i1 store without 64-bit support, we need two stores.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
@@ -41629,18 +44140,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
- MinAlign(Alignment, 4U),
+ St->getOriginalAlign(),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41659,7 +44171,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
// Split under-aligned vector non-temporal stores.
- if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ if (St->isNonTemporal() && StVT == VT &&
+ St->getAlignment() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -41713,7 +44226,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
if (TLI.isTruncStoreLegal(VT, StVT)) {
@@ -41731,6 +44244,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a store.
+ unsigned AddrSpace = St->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != St->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
+ return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
+ }
+ }
+
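As an illustrative aside, not part of the patch itself: on x86-64 these non-default address spaces are what clang's -fms-extensions pointer qualifiers are assumed to lower to, so a store like the sketch below is the kind of code that reaches this path; the combine simply addrspacecasts the base pointer back to the default address space before rebuilding the store. The function name is made up for the example.

// Hedged sketch: assumes clang -fms-extensions on x86-64, where the __ptr32
// qualifiers map onto the X86AS address spaces matched above.
void store_through_ptr32(unsigned int *__uptr __ptr32 p, unsigned int v) {
  *p = v; // 32-bit zero-extended pointer; the base pointer is cast back to
          // the default address space before the store node is rebuilt.
}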
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
@@ -41785,13 +44312,38 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *St = cast<MemIntrinsicSDNode>(N);
+
+ SDValue StoredVal = N->getOperand(1);
+ MVT VT = StoredVal.getSimpleValueType();
+ EVT MemVT = St->getMemoryVT();
+
+ // Figure out which elements we demand.
+ unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+ APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+ KnownZero, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
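A small self-contained sketch of the demanded-elements arithmetic above, with illustrative values: an extract-store whose memory type is i64 taken from a v4i32 source only demands the two low lanes, so the remaining lanes can be simplified away.

#include "llvm/ADT/APInt.h"
#include <cstdio>

int main() {
  unsigned MemBits = 64, ScalarBits = 32, NumElts = 4; // i64 store from v4i32
  unsigned StElts = MemBits / ScalarBits;              // 2 stored elements
  llvm::APInt Demanded = llvm::APInt::getLowBitsSet(NumElts, StElts);
  std::printf("demanded lane mask = 0x%llx\n",
              (unsigned long long)Demanded.getZExtValue()); // prints 0x3
  return 0;
}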
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
@@ -42028,17 +44580,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
- case ISD::AND:
- case ISD::XOR:
- case ISD::OR: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
-
case ISD::MUL:
// X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
@@ -42047,21 +44588,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
- case ISD::ADD: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::ADD:
case ISD::SUB: {
- // TODO: ISD::SUB We are conservative and require both sides to be freely
- // truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -42172,13 +44707,17 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
- if (!VT.is128BitVector() && !VT.is256BitVector())
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // Truncation to sub-128bit vXi32 can be better handled with shuffles.
+ if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
+ return SDValue();
+
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
@@ -42199,6 +44738,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+
+ // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
+ // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
+ // on and combines/simplifications can't then use it.
+ if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
+ return SDValue();
+
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
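A hedged scalar sketch of why the sign-bit requirement matters (the helper below is an illustration, not an LLVM API): PACKSSDW saturates, so it only behaves like a plain truncation when every source element is already sign-extended from the narrow type, which is exactly what the ComputeNumSignBits test above establishes.

#include <algorithm>
#include <cassert>
#include <cstdint>

int16_t packssdw_lane(int32_t x) { // scalar model of one PACKSSDW lane
  return (int16_t)std::clamp(x, (int32_t)INT16_MIN, (int32_t)INT16_MAX);
}

int main() {
  for (int32_t x : {0, 1, -1, 32767, -32768}) // already sign-extended from i16
    assert(packssdw_lane(x) == (int16_t)x);   // pack behaves like truncate
  assert(packssdw_lane(70000) == INT16_MAX);  // otherwise it saturates
  return 0;
}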
@@ -42227,9 +44773,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
- // Input type should be vXi32.
+ // Input type should be at least vXi32.
EVT InVT = Src.getValueType();
- if (InVT.getVectorElementType() != MVT::i32)
+ if (InVT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
@@ -42438,7 +44984,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return combineVectorTruncation(N, DAG, Subtarget);
}
-static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
@@ -42448,6 +44995,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -42540,37 +45092,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
+ // For accuracy reasons, we never combine fneg and fma under strict FP.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
@@ -42588,18 +45149,20 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
@@ -42613,80 +45176,25 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(OrigVT, NewNode);
}
- // If we're negating an FMA node, then we can adjust the
- // instruction to include the extra negation.
- if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
- switch (Arg.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- // We can't handle scalar intrinsic node here because it would only
- // invert one element and not the whole vector. But we could try to handle
- // a negation of the lower element only.
- unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
- return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
- }
- }
- }
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
+ if (SDValue NegArg =
+ TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
+ return DAG.getBitcast(OrigVT, NegArg);
return SDValue();
}
-char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations,
- bool ForCodeSize,
- unsigned Depth) const {
- // fneg patterns are removable even if they have multiple uses.
- if (isFNEG(DAG, Op.getNode(), Depth))
- return 2;
-
- // Don't recurse exponentially.
- if (Depth > SelectionDAG::MaxRecursionDepth)
- return 0;
-
- EVT VT = Op.getValueType();
- EVT SVT = VT.getScalarType();
- switch (Op.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
- break;
-
- // This is always negatible for free but we might be able to remove some
- // extra operand negations as well.
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- return V;
- }
- return 1;
- }
- }
-
- return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
-}
-
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
+ NegatibleCost &Cost,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
- if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
+ Cost = NegatibleCost::Cheaper;
return DAG.getBitcast(Op.getValueType(), Arg);
+ }
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
@@ -42701,35 +45209,41 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ !(SVT == MVT::f32 || SVT == MVT::f64) ||
+ !isOperationLegal(ISD::FMA, VT))
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- }
+ for (int i = 0; i != 3; ++i)
+ NewOps[i] = getCheaperNegatedExpression(
+ Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+ Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
+ : NegatibleCost::Neutral;
+
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
}
+ case X86ISD::FRCP:
+ if (SDValue NegOp0 =
+ getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Cost, Depth + 1))
+ return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
+ break;
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
+ ForCodeSize, Cost, Depth);
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
@@ -42790,6 +45304,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -42802,33 +45319,21 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
- return combineFneg(N, DAG, Subtarget);
+ return combineFneg(N, DAG, DCI, Subtarget);
}
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// TODO - Constant Folding.
- if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
- // Reduce Cst1 to the bottom 16-bits.
- // NOTE: SimplifyDemandedBits won't do this for constants.
- const APInt &Val1 = Cst1->getAPIntValue();
- APInt MaskedVal1 = Val1 & 0xFFFF;
- if (MaskedVal1 != Val1)
- return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
- DAG.getConstant(MaskedVal1, SDLoc(N), VT));
- }
-
- // Only bottom 16-bits of the control bits are required.
- APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
- if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
@@ -42919,6 +45424,7 @@ static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
@@ -42930,7 +45436,7 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
- if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
+ if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
@@ -43041,23 +45547,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getIntegerVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43067,33 +45566,33 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- // FIXME: Handle strict fp nodes.
+ bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
- SDValue In = N->getOperand(0);
+ SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getFloatingPointVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
- SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
- DAG.getBitcast(InVT, VZLoad));
- DCI.CombineTo(N, Convert);
+ if (IsStrict) {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43132,14 +45631,58 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
- if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
- return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
+ if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+
+ if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedElts = APInt::getLowBitsSet(8, 4);
+ if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
+ SDLoc dl(N);
+ if (IsStrict) {
+ SDValue Convert = DAG.getNode(
+ N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
+ DAG.getBitcast(MVT::v8i16, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+ }
return SDValue();
}
@@ -43225,7 +45768,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
//(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
- N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2,
@@ -43234,9 +45777,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
+ // Attempt to promote any comparison mask ops before moving the
+ // SIGN_EXTEND_INREG in the way.
+ if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
+
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
- SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
- N00, N1);
+ SDValue Tmp =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
@@ -43421,6 +45969,21 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
@@ -43428,8 +45991,8 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
}
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
@@ -43474,7 +46037,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
// We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
- if (Size > 256)
+ if (Size > 256 && Subtarget.useAVX512Regs())
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
@@ -43492,7 +46055,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
- Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
return Res;
}
@@ -43505,6 +46068,23 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
EVT InVT = N0.getValueType();
SDLoc DL(N);
+ // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ if (!DCI.isBeforeLegalizeOps() &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
+ }
+
+ return SDValue(N, 0);
+ }
+
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
@@ -43542,6 +46122,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43552,15 +46133,16 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
- SDValue C = N->getOperand(2);
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
- if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
- V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
+ CodeSize)) {
+ V = NegV;
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
@@ -43568,11 +46150,10 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
- if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
- SDValue NegVal =
- TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(
+ Vec, DAG, LegalOperations, CodeSize)) {
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
- NegVal, V.getOperand(1));
+ NegV, V.getOperand(1));
return true;
}
}
@@ -43592,9 +46173,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ if (IsStrict) {
+ assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
+ return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
+ {N->getOperand(0), A, B, C});
+ } else {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -43608,10 +46195,11 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
bool LegalOperations = !DCI.isBeforeLegalizeOps();
SDValue N2 = N->getOperand(2);
- if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
- return SDValue();
- SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ SDValue NegN2 =
+ TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ if (!NegN2)
+ return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
@@ -43624,38 +46212,26 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
- // (and (i32 x86isd::setcc_carry), 1)
- // This eliminates the zext. This transformation is necessary because
- // ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- if (N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- if (!isOneConstant(N0.getOperand(1)))
- return SDValue();
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
+ // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ // FIXME: Is this needed? We don't seem to have any tests for it.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
}
- }
- if (N0.getOpcode() == ISD::TRUNCATE &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
- }
+ return SDValue(N, 0);
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
@@ -43768,13 +46344,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
- bool HasAVX = Subtarget.hasAVX();
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && HasAVX) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
@@ -43828,11 +46403,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
DAG.getConstant(0, DL, VecVT), X,
- DAG.getConstant(0, DL, VecIdxVT));
+ DAG.getVectorIdxConstant(0, DL));
};
SDValue Cmp;
@@ -43865,17 +46438,16 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
Cmp);
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
- SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
+ SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
- // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
- // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ assert(Cmp.getValueType() == MVT::v16i8 &&
+ "Non 128-bit vector on pre-SSE41 target");
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
- SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
- MVT::i32);
+ SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
@@ -43892,23 +46464,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- // 0-x == y --> x+y == 0
- // 0-x != y --> x+y != 0
- if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
- LHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
- // x == 0-y --> x+y == 0
- // x != 0-y --> x+y != 0
- if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
- RHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
-
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
+
+ if (VT == MVT::i1 && isNullConstant(RHS)) {
+ SDValue X86CC;
+ if (SDValue V =
+ MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
+ }
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
@@ -43931,7 +46496,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
- "Uexpected operand type");
+ "Unexpected operand type");
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
@@ -44021,20 +46586,43 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
}
+static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
+ SDValue Index, SDValue Base, SDValue Scale,
+ SelectionDAG &DAG) {
+ SDLoc DL(GorS);
+
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
+ Gather->getMask(), Base, Index, Scale };
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
+ Scatter->getMask(), Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+}
+
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
- SDValue Chain = GorS->getChain();
SDValue Index = GorS->getIndex();
- SDValue Mask = GorS->getMask();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
@@ -44054,21 +46642,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44083,21 +46657,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44110,30 +46670,20 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
// With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = GorS->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
@@ -44172,10 +46722,11 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO: Could we move this to DAGCombine?
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
- // Take advantage of vector comparisons producing 0 or -1 in each lane to
- // optimize away operation when it's from a constant.
+ // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
+ // to optimize away the operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
@@ -44187,9 +46738,10 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
- if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
- Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
+ DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
@@ -44362,7 +46914,6 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
- EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
@@ -44373,11 +46924,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
- if (Ld->isSimple() && !VT.isVector() &&
- ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !Subtarget.is64Bit() && LdVT == MVT::i64) {
- std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
- SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
+ if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
+ Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
+ std::pair<SDValue, SDValue> Tmp =
+ Subtarget.getTargetLowering()->BuildFILD(
+ VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
}
@@ -44711,7 +47263,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
}
if (CC == X86::COND_A) {
- SDValue EFLAGS = Y->getOperand(1);
+ SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
@@ -44724,13 +47276,44 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
+ if (CC == X86::COND_AE) {
+ // X + SETAE --> sbb X, -1
+ // X - SETAE --> adc X, -1
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), Y.getOperand(1));
+ }
+
+ if (CC == X86::COND_BE) {
+ // X + SETBE --> sbb X, -1
+ // X - SETBE --> adc X, -1
+ SDValue EFLAGS = Y.getOperand(1);
+ // Try to convert COND_BE into COND_AE in an attempt to facilitate
+ // materializing "setae reg".
+ //
+ // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), NewEFLAGS);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
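A hedged arithmetic sketch of the SETAE/SETBE folds added above, in plain C++ with no DAG types: after cmp a, b the carry flag is the unsigned comparison a < b, and sbb X, -1 computes X - (-1) - CF = X + 1 - CF, which is exactly X + (a >= b).

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t a : {0u, 1u, 7u, 0xffffffffu})
    for (uint32_t b : {0u, 1u, 7u, 0xffffffffu})
      for (uint32_t x : {0u, 42u, 0xfffffffeu}) {
        uint32_t cf = a < b ? 1u : 0u;         // cmp a, b  (unsigned carry)
        uint32_t sbb = x - 0xffffffffu - cf;   // sbb x, -1  ==  x + 1 - cf
        assert(sbb == x + (a >= b ? 1u : 0u)); // x + SETAE
      }
  return 0;
}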
@@ -44767,15 +47350,18 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Cmp1.getValue(1));
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
@@ -44784,151 +47370,12 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), Cmp1);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), Cmp1);
-}
-
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // If the vector size is less than 128, or greater than the supported RegSize,
- // do not use PMADD.
- if (!VT.isVector() || VT.getVectorNumElements() < 8)
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- auto UsePMADDWD = [&](SDValue Op) {
- ShrinkMode Mode;
- return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
- Mode != ShrinkMode::MULU16 &&
- (!Subtarget.hasSSE41() ||
- (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- Op->isOnlyUserOf(Op.getOperand(1).getNode())));
- };
-
- SDValue MulOp, OtherOp;
- if (UsePMADDWD(Op0)) {
- MulOp = Op0;
- OtherOp = Op1;
- } else if (UsePMADDWD(Op1)) {
- MulOp = Op1;
- OtherOp = Op0;
- } else
- return SDValue();
-
- SDLoc DL(N);
- EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
- EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- VT.getVectorNumElements() / 2);
-
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
- // Madd vector size is half of the original vector size
- auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
- };
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
- if (!VT.isVector() || !VT.isSimple() ||
- !(VT.getVectorElementType() == MVT::i32))
- return SDValue();
-
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
- return SDValue();
-
- // We know N is a reduction add. To match SAD, we need one of the operands to
- // be an ABS.
- SDValue AbsOp = N->getOperand(0);
- SDValue OtherOp = N->getOperand(1);
- if (AbsOp.getOpcode() != ISD::ABS)
- std::swap(AbsOp, OtherOp);
- if (AbsOp.getOpcode() != ISD::ABS)
- return SDValue();
-
- // Check whether we have an abs-diff pattern feeding into the select.
- SDValue SadOp0, SadOp1;
- if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
- return SDValue();
-
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
- // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64
- // result to v2i32 which will be removed by type legalization. If we/ widen
- // narrow vectors then we bitcast to v4i32 and extract v2i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
- unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
- Ops[0] = Sad;
- Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
- Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
+ DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -45020,30 +47467,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
Mode == ShrinkMode::MULU16)
return SDValue();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i32 &&
- "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- InVT.getVectorNumElements());
- return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Mul.getOperand(0), Mul.getOperand(1) },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
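For reference, a hedged scalar model of one VPMADDWD lane (the helper name is illustrative): each i32 result lane is the sum of two adjacent signed i16 products, which is why the operands above are truncated to i16 vectors before the node is built.

#include <cstdint>
#include <cstdio>

int32_t pmaddwd_lane(int16_t a0, int16_t a1, int16_t b0, int16_t b1) {
  // Sum of the two adjacent 16x16 signed products, kept in 32 bits.
  return (int32_t)((int64_t)a0 * b0 + (int64_t)a1 * b1);
}

int main() {
  std::printf("%d\n", pmaddwd_lane(3, 4, 5, 6)); // 3*5 + 4*6 = 39
  return 0;
}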
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-// (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
@@ -45165,13 +47607,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- const SDNodeFlags Flags = N->getFlags();
- if (Flags.hasVectorReduction()) {
- if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
- return Sad;
- if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
- return MAdd;
- }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -45262,6 +47697,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SubusRHS = MinLHS;
else
return SDValue();
+ } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+ Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+ // Special case where the UMIN has been truncated. Try to push the truncate
+ // further up. This is similar to the i32/i64 special processing.
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+ SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+ EVT TruncVT = Op1.getOperand(0).getValueType();
+ if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
+ TruncVT == MVT::v8i64)) &&
+ !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+ return SDValue();
+ SDValue OpToSaturate;
+ if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinLHS.getOperand(0) == Op0)
+ OpToSaturate = MinRHS;
+ else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinRHS.getOperand(0) == Op0)
+ OpToSaturate = MinLHS;
+ else
+ return SDValue();
+
+ // Saturate the non-extended input and then truncate it.
+ SDLoc DL(N);
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits()),
+ DL, TruncVT);
+ SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+ SaturationConst);
+ SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();
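A hedged scalar sketch of the identity the SUBUS matcher relies on (names are illustrative): subtracting the unsigned minimum min(a, b) from a equals the saturating subtraction a -| b, so pushing the truncate above the UMIN keeps that shape visible.

#include <algorithm>
#include <cassert>
#include <cstdint>

uint16_t usubsat16(uint16_t a, uint16_t b) {
  return a > b ? uint16_t(a - b) : uint16_t(0); // what PSUBUS computes per lane
}

int main() {
  for (uint32_t a = 0; a < 0x10000; a += 257)
    for (uint32_t b = 0; b < 0x10000; b += 263) {
      uint16_t x = uint16_t(a), y = uint16_t(b);
      assert(uint16_t(x - std::min(x, y)) == usubsat16(x, y)); // sub(a, umin(a, b))
    }
  return 0;
}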
@@ -45376,6 +47843,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
@@ -45386,6 +47854,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
@@ -45402,13 +47871,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
// Repeated subvectors.
- if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ if (IsSplat) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+ // If this broadcast_load is inserted into both halves, use a larger
+ // broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -45420,12 +47904,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
- (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
- }
- bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+ // concat_vectors(extract_subvector(broadcast(x)),
+ // extract_subvector(broadcast(x))) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op0.getOperand(0).getValueType() == VT) {
+ if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
+ return Op0.getOperand(0);
+ }
+ }
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
@@ -45435,6 +47926,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
+ case X86ISD::SHUFP: {
+ // Add SHUFPD support if/when necessary.
+ if (!IsSplat && VT.getScalarType() == MVT::f32 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOperand(2) == Op0.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
+ }
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
@@ -45461,8 +47970,42 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
break;
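+    // Concatenate shift-by-immediate ops that all use the same shift amount
+    // into a single wider shift.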
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::VPERMI:
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI:
+ if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::PACKSS:
case X86ISD::PACKUS:
- if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
@@ -45476,6 +48019,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
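+    // Concatenate PALIGNR ops that all use the same shift amount into a
+    // single wider PALIGNR.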
+ case X86ISD::PALIGNR:
+ if (!IsSplat &&
+ ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
}
}
@@ -45565,7 +48126,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
- (IdxVal != 0 || !Vec.isUndef())) {
+ (IdxVal != 0 ||
+ !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
@@ -45654,7 +48216,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
- unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
@@ -45699,6 +48261,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -45716,7 +48279,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(InVecBC, DAG);
+ SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
@@ -45728,8 +48291,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
@@ -45779,6 +48340,43 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+  // If we're extracting an upper subvector from a broadcast, we should just
+  // extract the lowest subvector instead, which should allow
+  // SimplifyDemandedVectorElts to do more simplifications.
+ if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
+
+ // If we're extracting a broadcasted subvector, just use the source.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
+ InVec.getOperand(0).getValueType() == VT)
+ return InVec.getOperand(0);
+
+ // Attempt to extract from the source of a shuffle vector.
+ if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
+ (IdxVal % VT.getVectorNumElements()) == 0) {
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<int, 32> ScaledMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
+    // Decode the shuffle mask and scale it so that it shuffles subvectors.
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
+ scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
+ unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
+ if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+ if (ScaledMask[SubVecIdx] == SM_SentinelZero)
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+ SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
+ if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
+ unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
+ unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
+ return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
+ SDLoc(N), VT.getSizeInBits());
+ }
+ }
+ }
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
@@ -45851,13 +48449,30 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
- // TODO: Move to DAGCombine?
- if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
- Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
- Src.getOperand(0).getScalarValueSizeInBits() <= 32)
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
- DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
+ // TODO: Move to DAGCombine/SimplifyDemandedBits?
+ if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ auto IsAnyExt64 = [](SDValue Op) {
+ if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
+ return SDValue();
+ if (Op.getOpcode() == ISD::ANY_EXTEND &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return Op.getOperand(0);
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op))
+ if (Ld->getExtensionType() == ISD::EXTLOAD &&
+ Ld->getMemoryVT().getScalarSizeInBits() <= 32)
+ return Op;
+ return SDValue();
+ };
+ if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
+ }
+
+ // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
return SDValue();
}
@@ -45928,13 +48543,16 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
- ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
- EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
- VT.getVectorNumElements());
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+ ? ISD::SEXTLOAD
+ : ISD::ZEXTLOAD;
+ EVT MemVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getPointerInfo(), MemVT,
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
@@ -45971,6 +48589,196 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
+// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
+// extra instructions between the conversions due to going to scalar and back.
+static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::f32 ||
+ N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
+ N->getOperand(0).getOperand(0));
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+}
+
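+// Try to lower a vector FP_EXTEND from f16 via (STRICT_)CVTPH2PS, widening the
+// source to at least 8 i16 elements and extending the result back to the
+// original type if necessary.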
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Convert the input to vXi16.
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ Src = DAG.getBitcast(IntVT, Src);
+
+ // Widen to at least 8 input elements.
+ if (NumElts < 8) {
+ unsigned NumConcats = 8 / NumElts;
+ SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+ : DAG.getConstant(0, dl, IntVT);
+ SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+ }
+
+ // Destination is vXf32 with at least 4 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+ std::max(4U, NumElts));
+ SDValue Cvt, Chain;
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+ }
+
+ if (NumElts < 4) {
+ assert(NumElts == 2 && "Unexpected size");
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (IsStrict) {
+ // Extend to the original VT if necessary.
+ if (Cvt.getValueType() != VT) {
+ Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
+ {Chain, Cvt});
+ Chain = Cvt.getValue(1);
+ }
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+ }
+
+ // Extend to the original VT if necessary.
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
+// cases where the loads have the same input chain and the output chains are
+// unused. This avoids any memory ordering issues.
+static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Only do this if the chain result is unused.
+ if (N->hasAnyUseOfValue(1))
+ return SDValue();
+
+ auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
+
+ SDValue Ptr = MemIntrin->getBasePtr();
+ SDValue Chain = MemIntrin->getChain();
+ EVT VT = N->getSimpleValueType(0);
+ EVT MemVT = MemIntrin->getMemoryVT();
+
+ // Look at other users of our base pointer and try to find a wider broadcast.
+ // The input chain and the size of the memory VT must match.
+ for (SDNode *User : Ptr->uses())
+ if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ VT.getSizeInBits());
+ Extract = DAG.getBitcast(VT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+
+ return SDValue();
+}
+
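+// Try to lower a vector FP_ROUND from f32 to f16 via CVTPS2PH, widening the
+// source to at least 4 f32 elements and extracting the low elements of the
+// result if necessary.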
+static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+ SrcVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Widen to at least 4 input elements.
+ if (NumElts < 4)
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, SrcVT));
+
+  // Destination is vXi16 with at least 8 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ std::max(8U, NumElts));
+ SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+
+  // Extract down to the real number of elements.
+ if (NumElts < 8) {
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return DAG.getBitcast(VT, Cvt);
+}
+
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+ SDValue Src = N->getOperand(0);
+
+ // Turn MOVDQ2Q+simple_load into an mmx load.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+ if (LN->isSimple()) {
+ SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+ LN->getBasePtr(),
+ LN->getPointerInfo(),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+ return NewLd;
+ }
+ }
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -46002,8 +48810,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
- case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
- case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
@@ -46012,6 +48820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
+ case X86ISD::VEXTRACT_STORE:
+ return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
return combineSIntToFP(N, DAG, DCI, Subtarget);
@@ -46020,14 +48830,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
- case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
- case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
+ case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
@@ -46036,8 +48846,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
+ case X86ISD::STRICT_CVTTP2SI:
case X86ISD::CVTTP2SI:
- case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::CVTTP2UI:
+ return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTPH2PS:
+ case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
@@ -46060,12 +48875,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_VECTOR_ELT:
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -46097,12 +48914,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
+ case X86ISD::STRICT_FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
+ case X86ISD::STRICT_FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
+ case X86ISD::STRICT_FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
+ case ISD::FMA:
+ case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
@@ -46118,6 +48939,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
+ case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
+ case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
}
return SDValue();
@@ -46266,27 +49093,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return true;
}
-bool X86TargetLowering::
- isDesirableToCombineBuildVectorToShuffleTruncate(
- ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
-
- assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
- "Element count mismatch");
- assert(
- Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
- "Shuffle Mask expected to be legal");
-
- // For 32-bit elements VPERMD is better than shuffle+truncate.
- // TODO: After we improve lowerBuildVector, add execption for VPERMW.
- if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
- return false;
-
- if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
- return false;
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -46327,7 +49133,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
const std::string &AsmStr = IA->getAsmString();
@@ -46450,7 +49256,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'y':
case 'x':
case 'v':
- case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
@@ -46487,7 +49292,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
case 'z':
- case '0':
return C_Register;
case 'i':
case 'm':
@@ -46543,19 +49347,17 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
- case 'Y': {
- unsigned Size = StringRef(constraint).size();
- // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
- char NextChar = Size == 2 ? constraint[1] : 'i';
- if (Size > 2)
+ case 'Y':
+ if (StringRef(constraint).size() != 2)
break;
- switch (NextChar) {
+ switch (constraint[1]) {
default:
return CW_Invalid;
// XMM0
case 'z':
- case '0':
- if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+ ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
@@ -46568,7 +49370,7 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
- // Any SSE reg when ISA >= SSE2, same as 'Y'
+ // Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
case 't':
case '2':
@@ -46576,9 +49378,7 @@ TargetLowering::ConstraintWeight
return CW_Invalid;
break;
}
- // Fall through (handle "Y" constraint).
- LLVM_FALLTHROUGH;
- }
+ break;
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
@@ -46660,8 +49460,6 @@ LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget.hasSSE2())
- return "Y";
if (Subtarget.hasSSE1())
return "x";
}
@@ -46910,26 +49708,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32RegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
- if (VT == MVT::i64 || VT == MVT::f64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32_ABCDRegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
- if (VT == MVT::i64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
@@ -46940,15 +49738,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
- return std::make_pair(0U, &X86::GR64RegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
- return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ break;
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
@@ -46956,13 +49758,12 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
- return std::make_pair(0U, &X86::RFP80RegClass);
+ if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ break;
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
- case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget.hasSSE2()) break;
- LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
@@ -46981,7 +49782,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
- // TODO: Handle i128 in FR128RegClass after it is tested well.
+ case MVT::i128:
+ if (Subtarget.is64Bit()) {
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
+ }
+ break;
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
@@ -47005,6 +49812,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
+ case MVT::v64i8:
+ case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
@@ -47023,14 +49832,50 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'i':
case 't':
case '2':
- return getRegForInlineAsmConstraint(TRI, "Y", VT);
+ return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
- case '0':
if (!Subtarget.hasSSE1()) break;
- return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX())
+ return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+ break;
+ }
+ break;
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
@@ -47056,7 +49901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
- std::pair<unsigned, const TargetRegisterClass*> Res;
+ std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
@@ -47127,7 +49972,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
@@ -47243,8 +50088,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -47301,10 +50145,35 @@ bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+ return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+  // No inline stack probes for Windows; it has its own mechanism.
+ if (Subtarget.isOSWindows() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return false;
+
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false;
+}
+
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  // Inline stack probes disable the stack probe call.
+ if (hasInlineStackProbe(MF))
+ return "";
+
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
index 830cdfc79c0a..7f3dc90a2d73 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h
@@ -14,8 +14,6 @@
#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -24,680 +22,809 @@ namespace llvm {
namespace X86ISD {
// X86 Specific DAG Nodes
- enum NodeType : unsigned {
- // Start the numbering where the builtin ops leave off.
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
- /// Bit scan forward.
- BSF,
- /// Bit scan reverse.
- BSR,
-
- /// Double shift instructions. These correspond to
- /// X86::SHLDxx and X86::SHRDxx instructions.
- SHLD,
- SHRD,
-
- /// Bitwise logical AND of floating point values. This corresponds
- /// to X86::ANDPS or X86::ANDPD.
- FAND,
-
- /// Bitwise logical OR of floating point values. This corresponds
- /// to X86::ORPS or X86::ORPD.
- FOR,
-
- /// Bitwise logical XOR of floating point values. This corresponds
- /// to X86::XORPS or X86::XORPD.
- FXOR,
-
- /// Bitwise logical ANDNOT of floating point values. This
- /// corresponds to X86::ANDNPS or X86::ANDNPD.
- FANDN,
-
- /// These operations represent an abstract X86 call
- /// instruction, which includes a bunch of information. In particular the
- /// operands of these node are:
- ///
- /// #0 - The incoming token chain
- /// #1 - The callee
- /// #2 - The number of arg bytes the caller pushes on the stack.
- /// #3 - The number of arg bytes the callee pops off the stack.
- /// #4 - The value to pass in AL/AX/EAX (optional)
- /// #5 - The value to pass in DL/DX/EDX (optional)
- ///
- /// The result values of these nodes are:
- ///
- /// #0 - The outgoing token chain
- /// #1 - The first register result value (optional)
- /// #2 - The second register result value (optional)
- ///
- CALL,
-
- /// Same as call except it adds the NoTrack prefix.
- NT_CALL,
-
- /// X86 compare and logical compare instructions.
- CMP, COMI, UCOMI,
-
- /// X86 bit-test instructions.
- BT,
-
- /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
- /// operand, usually produced by a CMP instruction.
- SETCC,
-
- /// X86 Select
- SELECTS,
-
- // Same as SETCC except it's materialized with a sbb and the value is all
- // one's or all zero's.
- SETCC_CARRY, // R = carry_bit ? ~0 : 0
-
- /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
- /// Operands are two FP values to compare; result is a mask of
- /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
- FSETCC,
-
- /// X86 FP SETCC, similar to above, but with output as an i1 mask and
- /// and a version with SAE.
- FSETCCM, FSETCCM_SAE,
-
- /// X86 conditional moves. Operand 0 and operand 1 are the two values
- /// to select from. Operand 2 is the condition code, and operand 3 is the
- /// flag operand produced by a CMP or TEST instruction.
- CMOV,
-
- /// X86 conditional branches. Operand 0 is the chain operand, operand 1
- /// is the block to branch if condition is true, operand 2 is the
- /// condition code, and operand 3 is the flag operand produced by a CMP
- /// or TEST instruction.
- BRCOND,
-
- /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
- /// operand 1 is the target address.
- NT_BRIND,
-
- /// Return with a flag operand. Operand 0 is the chain operand, operand
- /// 1 is the number of bytes of stack to pop.
- RET_FLAG,
-
- /// Return from interrupt. Operand 0 is the number of bytes to pop.
- IRET,
-
- /// Repeat fill, corresponds to X86::REP_STOSx.
- REP_STOS,
-
- /// Repeat move, corresponds to X86::REP_MOVSx.
- REP_MOVS,
-
- /// On Darwin, this node represents the result of the popl
- /// at function entry, used for PIC code.
- GlobalBaseReg,
-
- /// A wrapper node for TargetConstantPool, TargetJumpTable,
- /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
- /// MCSymbol and TargetBlockAddress.
- Wrapper,
-
- /// Special wrapper used under X86-64 PIC mode for RIP
- /// relative displacements.
- WrapperRIP,
-
- /// Copies a 64-bit value from an MMX vector to the low word
- /// of an XMM vector, with the high word zero filled.
- MOVQ2DQ,
-
- /// Copies a 64-bit value from the low word of an XMM vector
- /// to an MMX vector.
- MOVDQ2Q,
-
- /// Copies a 32-bit value from the low word of a MMX
- /// vector to a GPR.
- MMX_MOVD2W,
-
- /// Copies a GPR into the low 32-bit word of a MMX vector
- /// and zero out the high word.
- MMX_MOVW2D,
-
- /// Extract an 8-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRB.
- PEXTRB,
-
- /// Extract a 16-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRW.
- PEXTRW,
-
- /// Insert any element of a 4 x float vector into any element
- /// of a destination 4 x floatvector.
- INSERTPS,
-
- /// Insert the lower 8-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRB.
- PINSRB,
-
- /// Insert the lower 16-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRW.
- PINSRW,
-
- /// Shuffle 16 8-bit values within a vector.
- PSHUFB,
-
- /// Compute Sum of Absolute Differences.
- PSADBW,
- /// Compute Double Block Packed Sum-Absolute-Differences
- DBPSADBW,
-
- /// Bitwise Logical AND NOT of Packed FP values.
- ANDNP,
-
- /// Blend where the selector is an immediate.
- BLENDI,
-
- /// Dynamic (non-constant condition) vector blend where only the sign bits
- /// of the condition elements are used. This is used to enforce that the
- /// condition mask is not valid for generic VSELECT optimizations. This
- /// is also used to implement the intrinsics.
- /// Operands are in VSELECT order: MASK, TRUE, FALSE
- BLENDV,
-
- /// Combined add and sub on an FP vector.
- ADDSUB,
-
- // FP vector ops with rounding mode.
- FADD_RND, FADDS, FADDS_RND,
- FSUB_RND, FSUBS, FSUBS_RND,
- FMUL_RND, FMULS, FMULS_RND,
- FDIV_RND, FDIVS, FDIVS_RND,
- FMAX_SAE, FMAXS_SAE,
- FMIN_SAE, FMINS_SAE,
- FSQRT_RND, FSQRTS, FSQRTS_RND,
-
- // FP vector get exponent.
- FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
- // Extract Normalized Mantissas.
- VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
- // FP Scale.
- SCALEF, SCALEF_RND,
- SCALEFS, SCALEFS_RND,
-
- // Unsigned Integer average.
- AVG,
-
- /// Integer horizontal add/sub.
- HADD,
- HSUB,
-
- /// Floating point horizontal add/sub.
- FHADD,
- FHSUB,
-
- // Detect Conflicts Within a Vector
- CONFLICT,
-
- /// Floating point max and min.
- FMAX, FMIN,
-
- /// Commutative FMIN and FMAX.
- FMAXC, FMINC,
-
- /// Scalar intrinsic floating point max and min.
- FMAXS, FMINS,
-
- /// Floating point reciprocal-sqrt and reciprocal approximation.
- /// Note that these typically require refinement
- /// in order to obtain suitable precision.
- FRSQRT, FRCP,
-
- // AVX-512 reciprocal approximations with a little more precision.
- RSQRT14, RSQRT14S, RCP14, RCP14S,
-
- // Thread Local Storage.
- TLSADDR,
-
- // Thread Local Storage. A call to get the start address
- // of the TLS block for the current module.
- TLSBASEADDR,
-
- // Thread Local Storage. When calling to an OS provided
- // thunk at the address from an earlier relocation.
- TLSCALL,
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// X86 funnel/double shift i16 instructions. These correspond to
+    /// X86::SHLDW and X86::SHRDW instructions, which have different amt
+    /// modulo rules from generic funnel shifts.
+ /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+ FSHL,
+ FSHR,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
- // Exception Handling helpers.
- EH_RETURN,
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
- // SjLj exception handling setjmp.
- EH_SJLJ_SETJMP,
+ /// X86 compare and logical compare instructions.
+ CMP,
+ FCMP,
+ COMI,
+ UCOMI,
- // SjLj exception handling longjmp.
- EH_SJLJ_LONGJMP,
+ /// X86 bit-test instructions.
+ BT,
- // SjLj exception handling dispatch.
- EH_SJLJ_SETUP_DISPATCH,
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
- /// Tail call return. See X86TargetLowering::LowerCall for
- /// the list of operands.
- TC_RETURN,
+ /// X86 Select
+ SELECTS,
- // Vector move to low scalar and zero higher vector elements.
- VZEXT_MOVL,
+    // Same as SETCC except it's materialized with an sbb and the value is all
+    // ones or all zeros.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
- // Vector integer truncate.
- VTRUNC,
- // Vector integer truncate with unsigned/signed saturation.
- VTRUNCUS, VTRUNCS,
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
- // Masked version of the above. Used when less than a 128-bit result is
- // produced since the mask only applies to the lower elements and can't
- // be represented by a select.
- // SRC, PASSTHRU, MASK
- VMTRUNC, VMTRUNCUS, VMTRUNCS,
-
- // Vector FP extend.
- VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
-
- // Vector FP round.
- VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
-
- // Masked version of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- VMFPROUND,
-
- // 128-bit vector logical left / right shift
- VSHLDQ, VSRLDQ,
-
- // Vector shift elements
- VSHL, VSRL, VSRA,
-
- // Vector variable shift
- VSHLV, VSRLV, VSRAV,
-
- // Vector shift elements by immediate
- VSHLI, VSRLI, VSRAI,
-
- // Shifts of mask registers.
- KSHIFTL, KSHIFTR,
-
- // Bit rotate by immediate
- VROTLI, VROTRI,
-
- // Vector packed double/float comparison.
- CMPP,
-
- // Vector integer comparisons.
- PCMPEQ, PCMPGT,
-
- // v8i16 Horizontal minimum and position.
- PHMINPOS,
-
- MULTISHIFT,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- CMPM,
- // Vector comparison with SAE for FP values
- CMPM_SAE,
-
- // Arithmetic operations with FLAGS results.
- ADD, SUB, ADC, SBB, SMUL, UMUL,
- OR, XOR, AND,
-
- // Bit field extract.
- BEXTR,
-
- // Zero High Bits Starting with Specified Bit Position.
- BZHI,
-
- // X86-specific multiply by immediate.
- MUL_IMM,
-
- // Vector sign bit extraction.
- MOVMSK,
-
- // Vector bitwise comparisons.
- PTEST,
-
- // Vector packed fp sign bitwise comparisons.
- TESTP,
-
- // OR/AND test for masks.
- KORTEST,
- KTEST,
-
- // ADD for masks.
- KADD,
-
- // Several flavors of instructions with vector shuffle behaviors.
- // Saturated signed/unnsigned packing.
- PACKSS,
- PACKUS,
- // Intra-lane alignr.
- PALIGNR,
- // AVX512 inter-lane alignr.
- VALIGN,
- PSHUFD,
- PSHUFHW,
- PSHUFLW,
- SHUFP,
- // VBMI2 Concat & Shift.
- VSHLD,
- VSHRD,
- VSHLDV,
- VSHRDV,
- //Shuffle Packed Values at 128-bit granularity.
- SHUF128,
- MOVDDUP,
- MOVSHDUP,
- MOVSLDUP,
- MOVLHPS,
- MOVHLPS,
- MOVSD,
- MOVSS,
- UNPCKL,
- UNPCKH,
- VPERMILPV,
- VPERMILPI,
- VPERMI,
- VPERM2X128,
-
- // Variable Permute (VPERM).
- // Res = VPERMV MaskV, V0
- VPERMV,
-
- // 3-op Variable Permute (VPERMT2).
- // Res = VPERMV3 V0, MaskV, V1
- VPERMV3,
-
- // Bitwise ternary logic.
- VPTERNLOG,
- // Fix Up Special Packed Float32/64 values.
- VFIXUPIMM, VFIXUPIMM_SAE,
- VFIXUPIMMS, VFIXUPIMMS_SAE,
- // Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
- // Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
- // Tests Types Of a FP Values for packed types.
- VFPCLASS,
- // Tests Types Of a FP Values for scalar types.
- VFPCLASSS,
-
- // Broadcast (splat) scalar or element 0 of a vector. If the operand is
- // a vector, this node may change the vector length as part of the splat.
- VBROADCAST,
- // Broadcast mask to vector.
- VBROADCASTM,
- // Broadcast subvector to vector.
- SUBV_BROADCAST,
-
- /// SSE4A Extraction and Insertion.
- EXTRQI, INSERTQI,
-
- // XOP arithmetic/logical shifts.
- VPSHA, VPSHL,
- // XOP signed/unsigned integer comparisons.
- VPCOM, VPCOMU,
- // XOP packed permute bytes.
- VPPERM,
- // XOP two source permutation.
- VPERMIL2,
-
- // Vector multiply packed unsigned doubleword integers.
- PMULUDQ,
- // Vector multiply packed signed doubleword integers.
- PMULDQ,
- // Vector Multiply Packed UnsignedIntegers with Round and Scale.
- MULHRS,
-
- // Multiply and Add Packed Integers.
- VPMADDUBSW, VPMADDWD,
-
- // AVX512IFMA multiply and add.
- // NOTE: These are different than the instruction and perform
- // op0 x op1 + op2.
- VPMADD52L, VPMADD52H,
-
- // VNNI
- VPDPBUSD,
- VPDPBUSDS,
- VPDPWSSD,
- VPDPWSSDS,
-
- // FMA nodes.
- // We use the target independent ISD::FMA for the non-inverted case.
- FNMADD,
- FMSUB,
- FNMSUB,
- FMADDSUB,
- FMSUBADD,
-
- // FMA with rounding mode.
- FMADD_RND,
- FNMADD_RND,
- FMSUB_RND,
- FNMSUB_RND,
- FMADDSUB_RND,
- FMSUBADD_RND,
-
- // Compress and expand.
- COMPRESS,
- EXPAND,
-
- // Bits shuffle
- VPSHUFBITQMB,
-
- // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
- SINT_TO_FP_RND, UINT_TO_FP_RND,
- SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
- SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
-
- // Vector float/double to signed/unsigned integer.
- CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
- // Scalar float/double to signed/unsigned integer.
- CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
-
- // Vector float/double to signed/unsigned integer with truncation.
- CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
- // Scalar float/double to signed/unsigned integer with truncation.
- CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
-
- // Vector signed/unsigned integer to float/double.
- CVTSI2P, CVTUI2P,
-
- // Masked versions of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
- MCVTSI2P, MCVTUI2P,
-
- // Vector float to bfloat16.
- // Convert TWO packed single data to one packed BF16 data
- CVTNE2PS2BF16,
- // Convert packed single data to packed BF16 data
- CVTNEPS2BF16,
- // Masked version of above.
- // SRC, PASSTHRU, MASK
- MCVTNEPS2BF16,
-
- // Dot product of BF16 pairs to accumulated into
- // packed single precision.
- DPBF16PS,
-
- // Save xmm argument registers to the stack, according to %al. An operator
- // is needed so that this can be expanded with control flow.
- VASTART_SAVE_XMM_REGS,
-
- // Windows's _chkstk call to do stack probing.
- WIN_ALLOCA,
-
- // For allocating variable amounts of stack space when using
- // segmented stacks. Check if the current stacklet has enough space, and
- // falls back to heap allocation if not.
- SEG_ALLOCA,
-
- // Memory barriers.
- MEMBARRIER,
- MFENCE,
-
- // Store FP status word into i16 register.
- FNSTSW16r,
-
- // Store contents of %ah into %eflags.
- SAHF,
-
- // Get a random integer and indicate whether it is valid in CF.
- RDRAND,
-
- // Get a NIST SP800-90B & C compliant random integer and
- // indicate whether it is valid in CF.
- RDSEED,
-
- // Protection keys
- // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
- // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
- // value for ECX.
- RDPKRU, WRPKRU,
-
- // SSE42 string comparisons.
- // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
- // will emit one or two instructions based on which results are used. If
- // flags and index/mask this allows us to use a single instruction since
- // we won't have to pick and opcode for flags. Instead we can rely on the
- // DAG to CSE everything and decide at isel.
- PCMPISTR,
- PCMPESTR,
-
- // Test if in transactional execution.
- XTEST,
-
- // ERI instructions.
- RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
- RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
-
- // Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
-
- // Masked version of above.
- // SRC, RND, PASSTHRU, MASK
- MCVTPS2PH,
-
- // Galois Field Arithmetic Instructions
- GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
-
- // LWP insert record.
- LWPINS,
-
- // User level wait
- UMWAIT, TPAUSE,
-
- // Enqueue Stores Instructions
- ENQCMD, ENQCMDS,
-
- // For avx512-vp2intersect
- VP2INTERSECT,
-
- /// X86 strict FP compare instructions.
- STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
- STRICT_FCMPS,
-
- // Vector packed double/float comparison.
- STRICT_CMPP,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- STRICT_CMPM,
-
- // Vector float/double to signed/unsigned integer with truncation.
- STRICT_CVTTP2SI, STRICT_CVTTP2UI,
-
- // Vector FP extend.
- STRICT_VFPEXT,
-
- // Vector FP round.
- STRICT_VFPROUND,
-
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- STRICT_VRNDSCALE,
-
- // Vector signed/unsigned integer to float/double.
- STRICT_CVTSI2P, STRICT_CVTUI2P,
-
- // Compare and swap.
- LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
- LCMPXCHG8_DAG,
- LCMPXCHG16_DAG,
- LCMPXCHG8_SAVE_EBX_DAG,
- LCMPXCHG16_SAVE_RBX_DAG,
-
- /// LOCK-prefixed arithmetic read-modify-write instructions.
- /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
- LADD, LSUB, LOR, LXOR, LAND,
-
- // Load, scalar_to_vector, and zero extend.
- VZEXT_LOAD,
-
- // extract_vector_elt, store.
- VEXTRACT_STORE,
-
- // scalar broadcast from memory
- VBROADCAST_LOAD,
-
- // Store FP control world into i16 memory.
- FNSTCW16m,
-
- /// This instruction implements FP_TO_SINT with the
- /// integer destination in memory and a FP reg source. This corresponds
- /// to the X86::FIST*m instructions and the rounding mode change stuff. It
- /// has two inputs (token chain and address) and two outputs (int value
- /// and token chain). Memory VT specifies the type to store to.
- FP_TO_INT_IN_MEM,
-
- /// This instruction implements SINT_TO_FP with the
- /// integer source in memory and FP reg result. This corresponds to the
- /// X86::FILD*m instructions. It has two inputs (token chain and address)
- /// and two outputs (FP value and token chain). FILD_FLAG also produces a
- /// flag). The integer source type is specified by the memory VT.
- FILD,
- FILD_FLAG,
-
- /// This instruction implements a fp->int store from FP stack
- /// slots. This corresponds to the fist instruction. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FIST,
-
- /// This instruction implements an extending load to FP stack slots.
- /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
- /// operand, and ptr to load from. The memory VT specifies the type to
- /// load from.
- FLD,
+    /// X86 FP SETCC, similar to above, but with output as an i1 mask
+    /// and a version with SAE.
+ FSETCCM,
+ FSETCCM_SAE,
- /// This instruction implements a truncating store from FP stack
- /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FST,
-
- /// This instruction grabs the address of the next argument
- /// from a va_list. (reads and modifies the va_list in memory)
- VAARG_64,
-
- // Vector truncating store with unsigned/signed saturation
- VTRUNCSTOREUS, VTRUNCSTORES,
- // Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS, VMTRUNCSTORES,
-
- // X86 specific gather and scatter
- MGATHER, MSCATTER,
-
- // WARNING: Do not add anything in the end unless you want the node to
- // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
- // opcodes will be thought as target memory ops!
- };
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector.
+ MOVDQ2Q,
+
+ /// Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// Copies a GPR into the low 32-bit word of a MMX vector
+ /// and zero out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+ /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8 bits of a 32-bit value into a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16 bits of a 32-bit value into a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations. This
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
+ BLENDV,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FADDS,
+ FADDS_RND,
+ FSUB_RND,
+ FSUBS,
+ FSUBS_RND,
+ FMUL_RND,
+ FMULS,
+ FMULS_RND,
+ FDIV_RND,
+ FDIVS,
+ FDIVS_RND,
+ FMAX_SAE,
+ FMAXS_SAE,
+ FMIN_SAE,
+ FMINS_SAE,
+ FSQRT_RND,
+ FSQRTS,
+ FSQRTS_RND,
+
+ // FP vector get exponent.
+ FGETEXP,
+ FGETEXP_SAE,
+ FGETEXPS,
+ FGETEXPS_SAE,
+ // Extract Normalized Mantissas.
+ VGETMANT,
+ VGETMANT_SAE,
+ VGETMANTS,
+ VGETMANTS_SAE,
+ // FP Scale.
+ SCALEF,
+ SCALEF_RND,
+ SCALEFS,
+ SCALEFS_RND,
+
+ // Unsigned Integer average.
+ AVG,
+
+ /// Integer horizontal add/sub.
+ HADD,
+ HSUB,
+
+ /// Floating point horizontal add/sub.
+ FHADD,
+ FHSUB,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX,
+ FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC,
+ FMINC,
+
+ /// Scalar intrinsic floating point max and min.
+ FMAXS,
+ FMINS,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT,
+ FRCP,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14,
+ RSQRT14S,
+ RCP14,
+ RCP14S,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // Thread Local Storage. A call to an OS-provided thunk at the
+ // address from an earlier relocation.
+ TLSCALL,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS,
+ VTRUNCS,
+
+ // Masked version of the above. Used when less than a 128-bit result is
+ // produced since the mask only applies to the lower elements and can't
+ // be represented by a select.
+ // SRC, PASSTHRU, MASK
+ VMTRUNC,
+ VMTRUNCUS,
+ VMTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT,
+ VFPEXT_SAE,
+ VFPEXTS,
+ VFPEXTS_SAE,
+
+ // Vector FP round.
+ VFPROUND,
+ VFPROUND_RND,
+ VFPROUNDS,
+ VFPROUNDS_RND,
+
+ // Masked version of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ VMFPROUND,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ,
+ VSRLDQ,
+
+ // Vector shift elements
+ VSHL,
+ VSRL,
+ VSRA,
+
+ // Vector variable shift
+ VSHLV,
+ VSRLV,
+ VSRAV,
+
+ // Vector shift elements by immediate
+ VSHLI,
+ VSRLI,
+ VSRAI,
+
+ // Shifts of mask registers.
+ KSHIFTL,
+ KSHIFTR,
+
+ // Bit rotate by immediate
+ VROTLI,
+ VROTRI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ,
+ PCMPGT,
+
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
+ MULTISHIFT,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ // Vector comparison with SAE for FP values
+ CMPM_SAE,
+
+ // Arithmetic operations with FLAGS results.
+ ADD,
+ SUB,
+ ADC,
+ SBB,
+ SMUL,
+ UMUL,
+ OR,
+ XOR,
+ AND,
+
+ // Bit field extract.
+ BEXTR,
+
+ // Zero High Bits Starting with Specified Bit Position.
+ BZHI,
+
+ // Parallel extract and deposit.
+ PDEP,
+ PEXT,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector sign bit extraction.
+ MOVMSK,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // OR/AND test for masks.
+ KORTEST,
+ KTEST,
+
+ // ADD for masks.
+ KADD,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+ // Saturated signed/unsigned packing.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr.
+ PALIGNR,
+ // AVX512 inter-lane alignr.
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ // VBMI2 Concat & Shift.
+ VSHLD,
+ VSHRD,
+ VSHLDV,
+ VSHRDV,
+ // Shuffle Packed Values at 128-bit granularity.
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVHLPS,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
+ VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
+ VPERMV3,
+
+ // Bitwise ternary logic.
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values.
+ VFIXUPIMM,
+ VFIXUPIMM_SAE,
+ VFIXUPIMMS,
+ VFIXUPIMMS_SAE,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+ VRANGE,
+ VRANGE_SAE,
+ VRANGES,
+ VRANGES_SAE,
+ // Reduce - Perform Reduction Transformation on scalar/packed FP.
+ VREDUCE,
+ VREDUCE_SAE,
+ VREDUCES,
+ VREDUCES_SAE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ VRNDSCALE,
+ VRNDSCALE_SAE,
+ VRNDSCALES,
+ VRNDSCALES_SAE,
+ // Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
+ VBROADCAST,
+ // Broadcast mask to vector.
+ VBROADCASTM,
+ // Broadcast subvector to vector.
+ SUBV_BROADCAST,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI,
+ INSERTQI,
+
+ // XOP arithmetic/logical shifts.
+ VPSHA,
+ VPSHL,
+ // XOP signed/unsigned integer comparisons.
+ VPCOM,
+ VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
+
+ // Vector multiply packed unsigned doubleword integers.
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers.
+ PMULDQ,
+ // Vector Multiply Packed Unsigned Integers with Round and Scale.
+ MULHRS,
+
+ // Multiply and Add Packed Integers.
+ VPMADDUBSW,
+ VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+ // NOTE: These are different from the instruction and perform
+ // op0 x op1 + op2.
+ VPMADD52L,
+ VPMADD52H,
+
+ // VNNI
+ VPDPBUSD,
+ VPDPBUSDS,
+ VPDPWSSD,
+ VPDPWSSDS,
+
+ // FMA nodes.
+ // We use the target independent ISD::FMA for the non-inverted case.
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // FMA with rounding mode.
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // Compress and expand.
+ COMPRESS,
+ EXPAND,
+
+ // Bits shuffle
+ VPSHUFBITQMB,
+
+ // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+ SINT_TO_FP_RND,
+ UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP,
+ SCALAR_UINT_TO_FP,
+ SCALAR_SINT_TO_FP_RND,
+ SCALAR_UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ CVTP2SI,
+ CVTP2UI,
+ CVTP2SI_RND,
+ CVTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer.
+ CVTS2SI,
+ CVTS2UI,
+ CVTS2SI_RND,
+ CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI,
+ CVTTP2UI,
+ CVTTP2SI_SAE,
+ CVTTP2UI_SAE,
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI,
+ CVTTS2UI,
+ CVTTS2SI_SAE,
+ CVTTS2UI_SAE,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P,
+ CVTUI2P,
+
+ // Masked versions of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ MCVTP2SI,
+ MCVTP2UI,
+ MCVTTP2SI,
+ MCVTTP2UI,
+ MCVTSI2P,
+ MCVTUI2P,
+
+ // Vector float to bfloat16.
+ // Convert two packed single-precision vectors to one packed BF16 vector.
+ CVTNE2PS2BF16,
+ // Convert a packed single-precision vector to a packed BF16 vector.
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+ // Dot product of BF16 pairs accumulated into
+ // packed single precision.
+ DPBF16PS,
+
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Windows' _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+ // segmented stacks. Checks if the current stacklet has enough space,
+ // and falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // For allocating stack space when using stack clash protector.
+ // Allocation is performed by block, and each block is probed.
+ PROBED_ALLOCA,
+
+ // Memory barriers.
+ MEMBARRIER,
+ MFENCE,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU,
+ WRPKRU,
+
+ // SSE42 string comparisons.
+ // These nodes produce three results: index, mask, and flags. X86ISelDAGToDAG
+ // will emit one or two instructions based on which results are used. If
+ // flags and index/mask are both used, this allows us to use a single
+ // instruction since we won't have to pick an opcode for flags. Instead we
+ // can rely on the DAG to CSE everything and decide at isel.
+ PCMPISTR,
+ PCMPESTR,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // ERI instructions.
+ RSQRT28,
+ RSQRT28_SAE,
+ RSQRT28S,
+ RSQRT28S_SAE,
+ RCP28,
+ RCP28_SAE,
+ RCP28S,
+ RCP28S_SAE,
+ EXP2,
+ EXP2_SAE,
+
+ // Conversions between float and half-float.
+ CVTPS2PH,
+ CVTPH2PS,
+ CVTPH2PS_SAE,
+
+ // Masked version of above.
+ // SRC, RND, PASSTHRU, MASK
+ MCVTPS2PH,
+
+ // Galois Field Arithmetic Instructions
+ GF2P8AFFINEINVQB,
+ GF2P8AFFINEQB,
+ GF2P8MULB,
+
+ // LWP insert record.
+ LWPINS,
+
+ // User level wait
+ UMWAIT,
+ TPAUSE,
+
+ // Enqueue Stores Instructions
+ ENQCMD,
+ ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
+ /// X86 strict FP compare instructions.
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPS,
+
+ // Vector packed double/float comparison.
+ STRICT_CMPP,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ STRICT_CMPM,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ STRICT_CVTTP2SI,
+ STRICT_CVTTP2UI,
+
+ // Vector FP extend.
+ STRICT_VFPEXT,
+
+ // Vector FP round.
+ STRICT_VFPROUND,
+
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ STRICT_VRNDSCALE,
+
+ // Vector signed/unsigned integer to float/double.
+ STRICT_CVTSI2P,
+ STRICT_CVTUI2P,
+
+ // Strict FMA nodes.
+ STRICT_FNMADD,
+ STRICT_FMSUB,
+ STRICT_FNMSUB,
+
+ // Conversions between float and half-float.
+ STRICT_CVTPS2PH,
+ STRICT_CVTPH2PS,
+
+ // Compare and swap.
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+ LCMPXCHG8_SAVE_EBX_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD,
+ LSUB,
+ LOR,
+ LXOR,
+ LAND,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
+ // scalar broadcast from memory
+ VBROADCAST_LOAD,
+
+ // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and an FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode changes. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). The integer source type is
+ /// specified by the memory VT.
+ FILD,
+
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
+ FLD,
+
+ /// This instruction implements a truncating store from FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FST,
+
+ /// This instruction grabs the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS,
+ VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS,
+ VMTRUNCSTORES,
+
+ // X86 specific gather and scatter
+ MGATHER,
+ MSCATTER,
+
+ // WARNING: Do not add anything in the end unless you want the node to
+ // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+ // opcodes will be treated as target memory ops!
+ };
} // end namespace X86ISD
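Editorial note: the operand orders documented in the enum above are prose only; as a minimal sketch (not part of this patch), building one of the documented nodes might look like the following, where TrueV, FalseV, CC and EFLAGS are hypothetical SDValues supplied by the caller.

// Minimal sketch, assuming LLVM's SelectionDAG API; not from the patch.
SDValue buildCMovSketch(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                        SDValue TrueV, SDValue FalseV, SDValue CC,
                        SDValue EFLAGS) {
  // Two values to select from, then the condition code, then the flags node
  // produced by a CMP or TEST, matching the CMOV comment in the enum above.
  return DAG.getNode(X86ISD::CMOV, DL, VT, TrueV, FalseV, CC, EFLAGS);
}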
/// Define some predicates that are used for node matching.
@@ -717,7 +844,10 @@ namespace llvm {
/// If Op is a constant whose elements are all the same constant or
/// undefined, return true and return the constant value in \p SplatVal.
- bool isConstantSplat(SDValue Op, APInt &SplatVal);
+ /// If we have undef bits that don't cover an entire element, we treat these
+ /// as zero if AllowPartialUndefs is set, else we fail and return false.
+ bool isConstantSplat(SDValue Op, APInt &SplatVal,
+ bool AllowPartialUndefs = true);
} // end namespace X86
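A hedged usage sketch of the isConstantSplat declaration above (editorial, not from the patch); Op is assumed to be an SDValue produced elsewhere in a DAG combine.

// Sketch: require a fully defined splat by disabling partial undefs.
static bool hasStrictConstantSplat(SDValue Op, APInt &SplatVal) {
  // With AllowPartialUndefs = false, elements whose bits are only partially
  // defined make the helper fail instead of treating the undef bits as zero.
  return X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/false);
}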
//===--------------------------------------------------------------------===//
@@ -756,19 +886,7 @@ namespace llvm {
unsigned getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
- /// Returns the target specific optimal type for load
- /// and store operations as a result of memset, memcpy, and memmove
- /// lowering. If DstAlign is zero that means it's safe to destination
- /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
- /// means there isn't a need to check it against alignment requirement,
- /// probably because the source does not need to be loaded. If 'IsMemset' is
- /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
- /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
- /// source is constant so it does not need to be loaded.
- /// It returns EVT::Other if the type should be determined using generic
- /// target-independent logic.
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
@@ -805,19 +923,6 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- // Return true if it is profitable to combine a BUILD_VECTOR with a
- // stride-pattern to a shuffle and a truncate.
- // Example of such a combine:
- // v4i32 build_vector((extract_elt V, 1),
- // (extract_elt V, 3),
- // (extract_elt V, 5),
- // (extract_elt V, 7))
- // -->
- // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
- // v4i64)
- bool isDesirableToCombineBuildVectorToShuffleTruncate(
- ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
-
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
@@ -830,15 +935,12 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
- /// Return 1 if we can compute the negated form of the specified expression
- /// for the same cost as the expression itself, or 2 if we can compute the
- /// negated form more cheaply than the expression itself. Else return 0.
- char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
- bool ForCodeSize, unsigned Depth) const override;
-
- /// If isNegatibleForFree returns true, return the newly negated expression.
+ /// Return the newly negated expression if the cost is not expensive, and
+ /// set \p Cost to indicate whether it is cheaper or neutral to
+ /// do the negation.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
unsigned Depth) const override;
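A hedged sketch of how a caller might consult the NegatibleCost out-parameter introduced above (editorial; TLI, Op, DAG, LegalOps and ForCodeSize are assumed to exist in the calling combine).

// Sketch: only use the negated form when it is no more expensive.
TargetLowering::NegatibleCost Cost = TargetLowering::NegatibleCost::Expensive;
if (SDValue Neg = TLI.getNegatedExpression(Op, DAG, LegalOps, ForCodeSize,
                                           Cost, /*Depth=*/0))
  if (Cost != TargetLowering::NegatibleCost::Expensive)
    return Neg; // Cheaper or neutral to negate; fold away the explicit FNEG.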
MachineBasicBlock *
@@ -934,7 +1036,8 @@ namespace llvm {
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
/// Determine which of the bits specified in Mask are known to be either
@@ -958,6 +1061,12 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned MaskIndex,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const;
+
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
@@ -1047,6 +1156,8 @@ namespace llvm {
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ /// This is used to enable splatted operand transforms for vector shifts
+ /// and vector funnel shifts.
bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// Add x86-specific opcodes to the default list.
@@ -1075,6 +1186,10 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
+ bool shouldConvertPhiType(Type *From, Type *To) const override;
+
/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;
@@ -1171,7 +1286,8 @@ namespace llvm {
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
- bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override;
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
@@ -1194,12 +1310,12 @@ namespace llvm {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
virtual bool needsFixedCatchObjects() const override;
@@ -1227,8 +1343,10 @@ namespace llvm {
/// offset as appropriate.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
- std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
- SDValue StackSlot,
+ std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+ SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo,
+ Align Alignment,
SelectionDAG &DAG) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
@@ -1236,6 +1354,8 @@ namespace llvm {
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+ bool softPromoteHalfType() const override { return true; }
+
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
@@ -1251,6 +1371,8 @@ namespace llvm {
bool supportSwiftError() const override;
+ bool hasStackProbeSymbol(MachineFunction &MF) const override;
+ bool hasInlineStackProbe(MachineFunction &MF) const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
unsigned getStackProbeSize(MachineFunction &MF) const;
@@ -1314,7 +1436,7 @@ namespace llvm {
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const;
+ ISD::ArgFlagsTy Flags, bool isByval) const;
// Call lowering helpers.
@@ -1340,8 +1462,9 @@ namespace llvm {
unsigned getAddressSpace(void) const;
- SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
SDValue &Chain) const;
+ SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1365,8 +1488,8 @@ namespace llvm {
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -1431,7 +1554,7 @@ namespace llvm {
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
TargetLoweringBase::AtomicExpansionKind
- shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
@@ -1464,18 +1587,15 @@ namespace llvm {
MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -1497,32 +1617,25 @@ namespace llvm {
MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
-
MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- /// Convert a comparison if required by the subtarget.
- SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
-
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
- SDValue &X86CC, SDValue &Chain,
- bool IsSignaling) const;
+ SDValue &X86CC) const;
/// Check if replacement of SQRT with RSQRT should be disabled.
- bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+ bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
/// Use rsqrt* to speed up sqrt calculations.
- SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
/// Use rcp* to speed up fdiv calculations.
- SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
@@ -1537,101 +1650,14 @@ namespace llvm {
const TargetLibraryInfo *libInfo);
} // end namespace X86
- // Base class for all X86 non-masked store operations.
- class X86StoreSDNode : public MemSDNode {
- public:
- X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
- const SDValue &getValue() const { return getOperand(1); }
- const SDValue &getBasePtr() const { return getOperand(2); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTORES ||
- N->getOpcode() == X86ISD::VTRUNCSTOREUS;
- }
- };
-
- // Base class for all X86 masked store operations.
- // The class has the same order of operands as MaskedStoreSDNode for
- // convenience.
- class X86MaskedStoreSDNode : public MemSDNode {
- public:
- X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-
- const SDValue &getValue() const { return getOperand(1); }
- const SDValue &getBasePtr() const { return getOperand(2); }
- const SDValue &getMask() const { return getOperand(3); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
- N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
- }
- };
-
- // X86 Truncating Store with Signed saturation.
- class TruncSStoreSDNode : public X86StoreSDNode {
- public:
- TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
- : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTORES;
- }
- };
-
- // X86 Truncating Store with Unsigned saturation.
- class TruncUSStoreSDNode : public X86StoreSDNode {
- public:
- TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
- : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
- }
- };
-
- // X86 Truncating Masked Store with Signed saturation.
- class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
- public:
- MaskedTruncSStoreSDNode(unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTORES;
- }
- };
-
- // X86 Truncating Masked Store with Unsigned saturation.
- class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
- public:
- MaskedTruncUSStoreSDNode(unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
- }
- };
-
// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
- class X86MaskedGatherScatterSDNode : public MemSDNode {
+ class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
public:
- X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
+ // This is intended as a utility and should never be directly created.
+ X86MaskedGatherScatterSDNode() = delete;
+ ~X86MaskedGatherScatterSDNode() = delete;
const SDValue &getBasePtr() const { return getOperand(3); }
const SDValue &getIndex() const { return getOperand(4); }
@@ -1646,11 +1672,6 @@ namespace llvm {
class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
- X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- EVT MemVT, MachineMemOperand *MMO)
- : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
- MMO) {}
-
const SDValue &getPassThru() const { return getOperand(1); }
static bool classof(const SDNode *N) {
@@ -1660,11 +1681,6 @@ namespace llvm {
class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
- X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- EVT MemVT, MachineMemOperand *MMO)
- : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
- MMO) {}
-
const SDValue &getValue() const { return getOperand(1); }
static bool classof(const SDNode *N) {
@@ -1673,47 +1689,15 @@ namespace llvm {
};
/// Generate unpacklo/unpackhi shuffle mask.
- template <typename T = int>
- void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
- bool Unary) {
- assert(Mask.empty() && "Expected an empty shuffle mask vector");
- int NumElts = VT.getVectorNumElements();
- int NumEltsInLane = 128 / VT.getScalarSizeInBits();
- for (int i = 0; i < NumElts; ++i) {
- unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
- int Pos = (i % NumEltsInLane) / 2 + LaneStart;
- Pos += (Unary ? 0 : NumElts * (i % 2));
- Pos += (Lo ? 0 : NumEltsInLane / 2);
- Mask.push_back(Pos);
- }
- }
-
- /// Helper function to scale a shuffle or target shuffle mask, replacing each
- /// mask index with the scaled sequential indices for an equivalent narrowed
- /// mask. This is the reverse process to canWidenShuffleElements, but can
- /// always succeed.
- template <typename T>
- void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
- SmallVectorImpl<T> &ScaledMask) {
- assert(0 < Scale && "Unexpected scaling factor");
- size_t NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
-
- for (size_t i = 0; i != NumElts; ++i) {
- int M = Mask[i];
-
- // Repeat sentinel values in every mask element.
- if (M < 0) {
- for (size_t s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = M;
- continue;
- }
-
- // Scale mask element and increment across each mask element.
- for (size_t s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = (Scale * M) + s;
- }
- }
+ void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary);
+
+ /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+ /// imposed by AVX and specific to the unary pattern. Example:
+ /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+ /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+ void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
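Editorial sketch of the splat2 mask pattern documented for createSplat2ShuffleMask above; this is an assumption drawn only from the comment's <0,0,1,1,...> / <4,4,5,5,...> examples, not the patch's own out-of-line implementation.

// Sketch: repeat each element of the chosen half twice, assuming NumElts is
// even; uses LLVM's SmallVectorImpl like the declaration above.
static void splat2MaskSketch(unsigned NumElts, bool Lo,
                             SmallVectorImpl<int> &Mask) {
  unsigned Base = Lo ? 0 : NumElts / 2; // low half -> 0..N/2-1, high -> N/2..N-1
  for (unsigned I = 0; I != NumElts / 2; ++I) {
    Mask.push_back(Base + I);
    Mask.push_back(Base + I);
  }
}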
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 0a79b793a980..1628f85da808 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -92,9 +92,7 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!CalleeFn)
return false;
AttributeList Attrs = CalleeFn->getAttributes();
- if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice))
- return true;
- return false;
+ return Attrs.hasFnAttribute(Attribute::ReturnsTwice);
}
bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
@@ -138,17 +136,38 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
if (MBB.hasAddressTaken())
Changed |= addENDBR(MBB, MBB.begin());
- // Exception handle may indirectly jump to catch pad, So we should add
- // ENDBR before catch pad instructions.
- bool EHPadIBTNeeded = MBB.isEHPad();
-
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
if (I->isCall() && IsCallReturnTwice(I->getOperand(0)))
Changed |= addENDBR(MBB, std::next(I));
+ }
- if (EHPadIBTNeeded && I->isEHLabel()) {
+ // Exception handlers may indirectly jump to a catch pad, so we should add
+ // ENDBR before catch pad instructions. For the SjLj exception model, a new
+ // BB (new landing pad) is created that indirectly jumps to the old one.
+ if (TM->Options.ExceptionModel == ExceptionHandling::SjLj) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ // New landing pad BB without an EHLabel.
+ if (MBB.isEHPad()) {
+ if (I->isDebugInstr())
+ continue;
+ Changed |= addENDBR(MBB, I);
+ break;
+ } else if (I->isEHLabel()) {
+ // Old landing pad BB (no longer a landing pad) with
+ // the old "callee" EHLabel.
+ MCSymbol *Sym = I->getOperand(0).getMCSymbol();
+ if (!MF.hasCallSiteLandingPad(Sym))
+ continue;
+ Changed |= addENDBR(MBB, std::next(I));
+ break;
+ }
+ }
+ } else if (MBB.isEHPad()) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!I->isEHLabel())
+ continue;
Changed |= addENDBR(MBB, std::next(I));
- EHPadIBTNeeded = false;
+ break;
}
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
index 36b9c3ccc959..828887d96129 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -29,6 +29,7 @@
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/IndirectThunks.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -40,6 +41,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -56,23 +58,6 @@ static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
namespace {
-template <typename Derived> class ThunkInserter {
- Derived &getDerived() { return *static_cast<Derived *>(this); }
-
-protected:
- bool InsertedThunks;
- void doInitialization(Module &M) {}
- void createThunkFunction(MachineModuleInfo &MMI, StringRef Name);
-
-public:
- void init(Module &M) {
- InsertedThunks = false;
- getDerived().doInitialization(M);
- }
- // return `true` if `MMI` or `MF` was modified
- bool run(MachineModuleInfo &MMI, MachineFunction &MF);
-};
-
struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
const char *getThunkPrefix() { return RetpolineNamePrefix; }
bool mayUseThunk(const MachineFunction &MF) {
@@ -94,12 +79,9 @@ struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
createThunkFunction(MMI, R11LVIThunkName);
}
void populateThunk(MachineFunction &MF) {
- // Grab the entry MBB and erase any other blocks. O0 codegen appears to
- // generate two bbs for the entry block.
+ assert(MF.size() == 1);
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
- while (MF.size() > 1)
- MF.erase(std::next(MF.begin()));
// This code mitigates LVI by replacing each indirect call/jump with a
// direct call/jump to a thunk that looks like:
@@ -128,12 +110,6 @@ public:
bool doInitialization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfoWrapperPass>();
- AU.addPreserved<MachineModuleInfoWrapperPass>();
- }
-
private:
std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
@@ -224,12 +200,9 @@ void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
}
const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- // Grab the entry MBB and erase any other blocks. O0 codegen appears to
- // generate two bbs for the entry block.
+ assert(MF.size() == 1);
MachineBasicBlock *Entry = &MF.front();
Entry->clear();
- while (MF.size() > 1)
- MF.erase(std::next(MF.begin()));
MachineBasicBlock *CaptureSpec =
MF.CreateMachineBasicBlock(Entry->getBasicBlock());
@@ -279,73 +252,6 @@ void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
}
-template <typename Derived>
-void ThunkInserter<Derived>::createThunkFunction(MachineModuleInfo &MMI,
- StringRef Name) {
- assert(Name.startswith(getDerived().getThunkPrefix()) &&
- "Created a thunk with an unexpected prefix!");
-
- Module &M = const_cast<Module &>(*MMI.getModule());
- LLVMContext &Ctx = M.getContext();
- auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
- Function *F =
- Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
- F->setVisibility(GlobalValue::HiddenVisibility);
- F->setComdat(M.getOrInsertComdat(Name));
-
- // Add Attributes so that we don't create a frame, unwind information, or
- // inline.
- AttrBuilder B;
- B.addAttribute(llvm::Attribute::NoUnwind);
- B.addAttribute(llvm::Attribute::Naked);
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
-
- // Populate our function a bit so that we can verify.
- BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
- IRBuilder<> Builder(Entry);
-
- Builder.CreateRetVoid();
-
- // MachineFunctions/MachineBasicBlocks aren't created automatically for the
- // IR-level constructs we already made. Create them and insert them into the
- // module.
- MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
- MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
-
- // Insert EntryMBB into MF. It's not in the module until we do this.
- MF.insert(MF.end(), EntryMBB);
- // Set MF properties. We never use vregs...
- MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
-}
-
-template <typename Derived>
-bool ThunkInserter<Derived>::run(MachineModuleInfo &MMI, MachineFunction &MF) {
- // If MF is not a thunk, check to see if we need to insert a thunk.
- if (!MF.getName().startswith(getDerived().getThunkPrefix())) {
- // If we've already inserted a thunk, nothing else to do.
- if (InsertedThunks)
- return false;
-
- // Only add a thunk if one of the functions has the corresponding feature
- // enabled in its subtarget, and doesn't enable external thunks.
- // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
- // nothing will end up calling it.
- // FIXME: It's a little silly to look at every function just to enumerate
- // the subtargets, but eventually we'll want to look at them for indirect
- // calls, so maybe this is OK.
- if (!getDerived().mayUseThunk(MF))
- return false;
-
- getDerived().insertThunks(MMI);
- InsertedThunks = true;
- return true;
- }
-
- // If this *is* a thunk function, we need to populate it with the correct MI.
- getDerived().populateThunk(MF);
- return true;
-}
-
FunctionPass *llvm::createX86IndirectThunksPass() {
return new X86IndirectThunks();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
index 2b1e3f23efd7..53925bbfd72f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) {
void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
new file mode 100644
index 000000000000..a82d98d88b30
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -0,0 +1,151 @@
+//===-- X86InsertWait.cpp - Insert wait instructions for strict FP X87 --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 wait instructions after each
+// X87 instruction when strict floating point is enabled.
+//
+// The logic to insert a wait instruction after an X87 instruction is as below:
+// 1. If the X87 instruction neither raises a floating-point exception nor is
+//    a load/store instruction, or it is an X87 control instruction, don't
+//    insert a wait.
+// 2. If the X87 instruction is followed by an X87 exception-synchronizing
+//    instruction, don't insert a wait.
+// 3. Otherwise, insert a wait instruction.
+//
+//===----------------------------------------------------------------------===//
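The insertion rule above can be condensed into a single predicate; the following is an editorial sketch (not part of the patch) that mirrors the checks the pass performs in runOnMachineFunction below, with booleans standing in for the helper functions defined in this file.

// Sketch: should a WAIT be emitted after the current X87 instruction?
static bool needsWaitAfter(bool IsX87, bool RaisesFPExcOrMem, bool IsControl,
                           bool NextIsX87, bool NextIsNonWaitingControl) {
  if (!IsX87)
    return false;                      // only X87 instructions are considered
  if (!RaisesFPExcOrMem || IsControl)
    return false;                      // rule 1
  if (NextIsX87 && !NextIsNonWaitingControl)
    return false;                      // rule 2: the next instruction synchronizes
  return true;                         // rule 3
}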
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-insert-wait"
+
+namespace {
+
+class WaitInsert : public MachineFunctionPass {
+public:
+ static char ID;
+
+ WaitInsert() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "X86 insert wait instruction";
+ }
+
+private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+};
+
+} // namespace
+
+char WaitInsert::ID = 0;
+
+FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); }
+
+/// Return true if Reg is an X87 register.
+static bool isX87Reg(unsigned Reg) {
+ return (Reg == X86::FPCW || Reg == X86::FPSW ||
+ (Reg >= X86::ST0 && Reg <= X86::ST7));
+}
+
+/// Check whether the instruction is an X87 instruction.
+static bool isX87Instruction(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (isX87Reg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+static bool isX87ControlInstruction(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FLDCW16m:
+ case X86::FNSTCW16m:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNCLEX:
+ case X86::FLDENVm:
+ case X86::FSTENVm:
+ case X86::FRSTORm:
+ case X86::FSAVEm:
+ case X86::FINCSTP:
+ case X86::FDECSTP:
+ case X86::FFREE:
+ case X86::FFREEP:
+ case X86::FNOP:
+ case X86::WAIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isX87NonWaitingControlInstruction(MachineInstr &MI) {
+ // a few special control instructions don't perform a wait operation
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNSTCW16m:
+ case X86::FNCLEX:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::StrictFP))
+ return false;
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Skip non-X87 instructions.
+ if (!isX87Instruction(*MI))
+ continue;
+ // If the instruction neither raises a floating-point exception nor is a
+ // load/store instruction, or it is an X87 control instruction, do not
+ // insert a wait.
+ if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) ||
+ isX87ControlInstruction(*MI))
+ continue;
+ // If the following instruction is an X87 instruction and isn't an X87
+ // non-waiting control instruction, we can omit inserting a wait instruction.
+ MachineBasicBlock::iterator AfterMI = std::next(MI);
+ if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) &&
+ !isX87NonWaitingControlInstruction(*AfterMI))
+ continue;
+
+ BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII->get(X86::WAIT));
+ LLVM_DEBUG(dbgs() << "\nInsert wait after:\t" << *MI);
+ // Skip over the newly inserted wait instruction.
+ ++MI;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
new file mode 100644
index 000000000000..e26dd5050a23
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAMX.td
@@ -0,0 +1,119 @@
+//===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel AMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMX instructions
+
+let Predicates = [HasAMXTILE, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "ldtilecfg\t$src",
+ [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
+ def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "sttilecfg\t$src",
+ [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
+ def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloadd\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XD;
+ def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloaddt1\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8PD;
+ let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
+ "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
+ def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
+ (ins sibmem:$dst, TILE:$src),
+ "tilestored\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XS;
+ def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
+ "tilezero\t$dst", []>,
+ VEX, T8XD;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
+ sibmem:$src2), []>;
+ def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
+ def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
+ [(int_x86_tilezero imm:$src)]>;
+ }
+ } // SchedRW
+} // HasAMXTILE
+
+let Predicates = [HasAMXINT8, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in {
+ def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XD;
+ def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XS;
+ def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PD;
+ def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PS;
+ }
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbssd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbsud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbusd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbuud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
+ }
+} // HasAMXINT8
+
+let Predicates = [HasAMXBF16, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in
+ def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, VEX_4V, T8XS;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbf16ps imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
+ }
+} // HasAMXBF16
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
index 32f012033fb0..a3ad0b1c8dd6 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -76,11 +76,11 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
- ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
- !cast<ComplexPattern>("sse_load_f32"),
- !if (!eq (EltTypeName, "f64"),
- !cast<ComplexPattern>("sse_load_f64"),
- ?));
+ PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"),
+ !cast<PatFrags>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"),
+ !cast<PatFrags>("sse_load_f64"),
+ ?));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -169,6 +169,18 @@ def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+// Used for matching masked operations. Ensures the operation part only has a
+// single use.
+def vselect_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (vselect node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
+def X86selects_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (X86selects node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
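Editorial sketch of what a predicate such as isProfitableToFormMaskedOp could check, based only on the comment above ("the operation part only has a single use"); the real definition lives elsewhere in the backend and may differ.

// Sketch: only fold the selected operation into a masked instruction when the
// unmasked result (operand 1 of the select) has no other users.
static bool isProfitableToFormMaskedOpSketch(SDNode *N) {
  return N->getOperand(1).hasOneUse();
}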
// This multiclass generates the masking variants from the non-masking
// variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as
@@ -220,7 +232,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- SDNode Select = vselect,
+ SDPatternOperator Select = vselect_mask,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
@@ -236,35 +248,36 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
+// preserved vector elements come from a new dummy input operand tied to $dst.
// This version uses a separate dag for non-masking and masking.
multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
- SDNode Select = vselect> :
+ bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
[(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
- "$src0 = $dst", IsCommutable, IsKCommutable>;
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
+ "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
+// preserved vector elements come from a new dummy input operand tied to $dst.
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable,
- SDNode Select = vselect> :
+ SDPatternOperator Select = vselect_mask> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
@@ -280,7 +293,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
string AttSrcAsm, string IntelSrcAsm,
dag RHS> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, 0, 0, 0, X86selects>;
+ RHS, 0, 0, 0, X86selects_mask>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -292,7 +305,7 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
- SDNode Select = vselect,
+ SDPatternOperator Select = vselect_mask,
bit MaskOnly = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
@@ -317,9 +330,9 @@ multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag),
- (vselect InVT.KRCWM:$mask, RHS,
+ (vselect_mask InVT.KRCWM:$mask, RHS,
(bitconvert InVT.RC:$src1)),
- vselect, "", IsCommutable>;
+ vselect_mask, "", IsCommutable>;
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
@@ -330,7 +343,7 @@ multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
bit MaskOnly = 0> :
AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
- X86selects, MaskOnly>;
+ X86selects_mask, MaskOnly>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
@@ -399,6 +412,36 @@ multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(and _.KRCWM:$mask, RHS_su), IsCommutable>;
+// Used by conversion instructions.
+multiclass AVX512_maskable_cvt<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, dag ZeroMaskingRHS> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst, ZeroMaskingRHS)],
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_fma<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, bit IsCommutable,
+ bit IsKCommutable> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.RC:$src1))],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.ImmAllZerosV))],
+ "", IsCommutable, IsKCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
@@ -625,45 +668,45 @@ multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
list<Predicate> p> {
let Predicates = p in {
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT From.RC:$src2),
- (iPTR imm))),
- Cast.RC:$src0)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.RC:$src0)),
(!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT
- (bitconvert
- (From.LdFrag addr:$src2))),
- (iPTR imm))),
- Cast.RC:$src0)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT
+ (bitconvert
+ (From.LdFrag addr:$src2))),
+ (iPTR imm))),
+ Cast.RC:$src0)),
(!cast<Instruction>(InstrStr#"rmk")
Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT From.RC:$src2),
- (iPTR imm))),
- Cast.ImmAllZerosV)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT (From.LdFrag addr:$src2)),
- (iPTR imm))),
- Cast.ImmAllZerosV)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#"rmkz")
Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
@@ -981,20 +1024,20 @@ multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From,
SDNodeXForm EXTRACT_get_vextract_imm,
list<Predicate> p> {
let Predicates = p in {
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (vextract_extract:$ext
- (From.VT From.RC:$src), (iPTR imm)))),
- To.RC:$src0)),
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ To.RC:$src0)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (vextract_extract:$ext
- (From.VT From.RC:$src), (iPTR imm)))),
- Cast.ImmAllZerosV)),
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ Cast.ImmAllZerosV)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask, From.RC:$src,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
@@ -1101,18 +1144,18 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
string Name,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#r)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rr)
(SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast SrcInfo.FRC:$src),
- DestInfo.RC:$src0)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#rk)
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
(SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast SrcInfo.FRC:$src),
- DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz)
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrkz)
DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
}
@@ -1128,83 +1171,83 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
SDPatternOperator UnmaskedOp = X86VBroadcast,
SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> {
let hasSideEffects = 0 in
- def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set MaskInfo.RC:$dst,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
- DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
- def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
- "${dst} {${mask}} {z}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
- let Constraints = "$src0 = $dst" in
- def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
- SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
- "${dst} {${mask}}, $src}"),
+ def rr : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
+ DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
let hasSideEffects = 0, mayLoad = 1 in
- def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set MaskInfo.RC:$dst,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedBcastOp addr:$src)))))],
- DestInfo.ExeDomain>, T8PD, EVEX,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
-
- def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
- "${dst} {${mask}} {z}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (SrcInfo.BroadcastLdFrag addr:$src)))),
- MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+ def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedBcastOp addr:$src)))))],
+ DestInfo.ExeDomain>, T8PD, EVEX,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
let Constraints = "$src0 = $dst",
isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
- def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
- SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
- "${dst} {${mask}}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (SrcInfo.BroadcastLdFrag addr:$src)))),
- MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
// Helper class to force mask and broadcast result to same type.
@@ -1267,35 +1310,38 @@ defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC> {
+ // Fold with a mask even if it has multiple uses since it is cheap.
let ExeDomain = _.ExeDomain in
- defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins SrcRC:$src),
- "vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX,
- Sched<[SchedRR]>;
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src),
+ "vpbroadcast"#_.Suffix, "$src", "$src",
+ (_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0,
+ /*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>,
+ T8PD, EVEX, Sched<[SchedRR]>;
}
multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC, SubRegIndex Subreg> {
let hasSideEffects = 0, ExeDomain = _.ExeDomain in
- defm r : AVX512_maskable_custom<opc, MRMSrcReg,
- (outs _.RC:$dst), (ins GR32:$src),
- !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
- !con((ins _.KRCWM:$mask), (ins GR32:$src)),
- "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
- "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+ defm rr : AVX512_maskable_custom<opc, MRMSrcReg,
+ (outs _.RC:$dst), (ins GR32:$src),
+ !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+ !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+ "vpbroadcast"#_.Suffix, "$src", "$src", [], [], [],
+ "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
- (!cast<Instruction>(Name#r)
+ (!cast<Instruction>(Name#rr)
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+ // Fold with a mask even if it has multiple uses since it is cheap.
def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
- (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#rrk) _.RC:$src0, _.KRCWM:$mask,
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
- (!cast<Instruction>(Name#rkz) _.KRCWM:$mask,
+ (!cast<Instruction>(Name#rrkz) _.KRCWM:$mask,
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
}
@@ -1392,72 +1438,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
-let Predicates = [HasAVX512] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZm addr:$src)>;
-}
-
-let Predicates = [HasVLX] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZ128m addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZ256m addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZ128m addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZ256m addr:$src)>;
-}
-let Predicates = [HasVLX, HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZ256m addr:$src)>;
-}
-let Predicates = [HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZm addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
//
@@ -1516,38 +1496,38 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (v16f32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- (v16i32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
(VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- (v8f64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- (v8i64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -1569,21 +1549,21 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
(VBROADCASTI32X4Z256rm addr:$src)>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (v8f32 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- (v8i32 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v8i32 immAllZerosV)),
(VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
@@ -1618,21 +1598,21 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2"
EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (v4f64 immAllZerosV)),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (v4i64 immAllZerosV)),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
@@ -1651,38 +1631,38 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
EVEX_V512, EVEX_CD8<32, CD8VT8>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- (v16f32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
- (v16i32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
(VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (v8f64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (v8i64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -1836,24 +1816,27 @@ defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
X86VectorVTInfo IdxVT,
X86VectorVTInfo CastVT> {
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 (_.VT _.RC:$src2),
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 (_.VT _.RC:$src2),
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ _.RC:$src3),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 _.RC:$src2,
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (_.LdFrag addr:$src3)),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ (_.LdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 _.RC:$src2,
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (_.BroadcastLdFrag addr:$src3)),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (_.BroadcastLdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
}
@@ -2085,9 +2068,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc),
- (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
@@ -2646,13 +2629,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
(i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su (_.VT _.RC:$src1),
@@ -2660,18 +2643,18 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
- (X86Vfpclasss _.ScalarIntMemCPat:$src1,
- (i32 timm:$src2)))]>,
+ (X86Vfpclasss (_.ScalarIntMemFrags addr:$src1),
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
+ (X86Vfpclasss_su (_.ScalarIntMemFrags addr:$src1),
(i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2686,13 +2669,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
(i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclass_su (_.VT _.RC:$src1),
@@ -2700,7 +2683,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"{"#mem#"}"#
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
@@ -2708,7 +2691,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"{"#mem#"}"#
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
@@ -2716,18 +2699,18 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst|$dst, ${src1}"
- ##_.BroadcastStr##", $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst|$dst, ${src1}"
+ #_.BroadcastStr#", $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.BroadcastLdFrag addr:$src1)),
(i32 timm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
- _.BroadcastStr##", $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst {${mask}}|$dst {${mask}}, ${src1}"#
+ _.BroadcastStr#", $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.BroadcastLdFrag addr:$src1)),
(i32 timm:$src2))))]>,
@@ -2979,6 +2962,8 @@ def : Pat<(vnot VK4:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
def : Pat<(vnot VK2:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
+def : Pat<(vnot VK1:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK1:$src, VK16)), VK1)>;
// Mask binary operation
// - KAND, KANDN, KOR, KXNOR, KXOR
@@ -3008,8 +2993,6 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
-def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
-def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
// These nodes use 'vnot' instead of 'not' to support vectors.
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
@@ -3022,7 +3005,7 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XM
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
-multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
+multiclass avx512_binop_pat<SDPatternOperator VOpNode,
Instruction Inst> {
  // With AVX512F, an 8-bit mask is promoted to a 16-bit mask; with DQI this
  // type is legal and the KxxxB instructions are used.
@@ -3033,25 +3016,25 @@ multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
(COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
// All types smaller than 8 bits require conversion anyway
- def : Pat<(OpNode VK1:$src1, VK1:$src2),
+ def : Pat<(VOpNode VK1:$src1, VK1:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK1:$src1, VK16),
(COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
def : Pat<(VOpNode VK2:$src1, VK2:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK2:$src1, VK16),
- (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>;
+ (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
def : Pat<(VOpNode VK4:$src1, VK4:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK4:$src1, VK16),
- (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>;
+ (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
}
-defm : avx512_binop_pat<and, and, KANDWrr>;
-defm : avx512_binop_pat<vandn, andn, KANDNWrr>;
-defm : avx512_binop_pat<or, or, KORWrr>;
-defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
-defm : avx512_binop_pat<xor, xor, KXORWrr>;
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<vandn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<vxnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
@@ -3065,7 +3048,7 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
VEX_4V, VEX_L, Sched<[sched]>;
def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
- (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>;
+ (!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>;
}
}
@@ -3201,8 +3184,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3219,8 +3202,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
timm:$cc), Narrow.KRC)>;
// Broadcast load.
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3235,8 +3218,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, timm:$cc), Narrow.KRC)>;
// Commuted with broadcast load.
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
- (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3301,7 +3284,7 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
let Predicates = [HasAVX512] in
let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1,
SchedRW = [WriteZero] in
- def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ def NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
[(set KRC:$dst, (VT Val))]>;
}
@@ -3409,7 +3392,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT
- (vselect _.KRCWM:$mask,
+ (vselect_mask _.KRCWM:$mask,
(_.VT (ld_frag addr:$src1)),
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K, Sched<[Sched.RM]>;
@@ -3418,18 +3401,18 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
"${dst} {${mask}} {z}, $src}",
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (vselect_mask _.KRCWM:$mask,
(_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
_.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
- (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0,
+ (!cast<Instruction>(Name#_.ZSuffix#rmk) _.RC:$src0,
_.KRCWM:$mask, addr:$ptr)>;
}
@@ -4286,6 +4269,17 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0)))
def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>;
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 VR128X:$src2))),
+ (VMOVSSZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 VR128X:$src2))),
+ (VMOVSDZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 immAllZerosV))),
+ (VMOVSSZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 immAllZerosV))),
+ (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
@@ -4439,8 +4433,6 @@ let Predicates = [HasAVX512] in {
(VMOV64toPQIZrr GR64:$src)>;
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v8i32 (X86vzload32 addr:$src)),
@@ -4624,8 +4616,8 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src2)))>,
AVX512BIBase, EVEX_4V, EVEX_B,
@@ -4750,8 +4742,8 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
OpcodeStr,
- "${src2}"##_Brdct.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Brdct.BroadcastStr,
+ "${src2}"#_Brdct.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
AVX512BIBase, EVEX_4V, EVEX_B,
@@ -4822,8 +4814,8 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
- "${src2}"##_Src.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Src.BroadcastStr,
+ "${src2}"#_Src.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
@@ -5159,26 +5151,26 @@ multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Masked register-register logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
_.RC:$src2)>;
// Masked register-memory logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1,
(load addr:$src2)))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1,
(load addr:$src2)))),
_.ImmAllZerosV)),
@@ -5190,14 +5182,14 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Register-broadcast logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
(IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
(IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
@@ -5304,7 +5296,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2))>,
+ (_.ScalarIntMemFrags addr:$src2)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5350,7 +5342,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2))>,
+ (_.ScalarIntMemFrags addr:$src2)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let isCodeGenOnly = 1, Predicates = [HasAVX512],
@@ -5463,28 +5455,32 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
bit IsKCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
IsKCommutable, IsKCommutable>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
- defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
+ defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5496,7 +5492,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
@@ -5507,38 +5503,39 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
EVEX_4V, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
Predicate prd, X86SchedWriteSizes sched,
bit IsCommutable = 0,
bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
- defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+ defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+ defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info,
sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
- defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+ defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+ defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+ defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info,
sched.PD.XMM, IsPD128Commutable,
IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+ defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info,
sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
@@ -5566,38 +5563,38 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, HasAVX512,
+defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512,
SchedWriteFAddSizes, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, HasAVX512,
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512,
SchedWriteFMulSizes, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, HasAVX512,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512,
SchedWriteFAddSizes>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, HasAVX512,
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512,
SchedWriteFDivSizes>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512,
SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512,
SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
- defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512,
SchedWriteFCmpSizes, 1>;
- defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512,
SchedWriteFCmpSizes, 1>;
}
let Uses = []<Register>, mayRaiseFPException = 0 in {
-defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
+defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 0>;
-defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
+defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
}
@@ -5605,19 +5602,19 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5627,14 +5624,14 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
+ (OpNode _.RC:$src1, (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5648,11 +5645,11 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
@@ -5679,7 +5676,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
// NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
- // There are just too many permuations due to commutability and bitcasts.
+ // There are just too many permutations due to commutability and bitcasts.
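  // (Illustration, not from the source: a pattern testing (and X, Y) for a
  //  nonzero result would also need the commuted (and Y, X) form plus every
  //  legal placement of a bitconvert on X or Y, for each element width, so the
  //  instructions are selected by hand instead.)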
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -5701,8 +5698,8 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(null_frag), (null_frag)>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5790,7 +5787,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
- "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
+ "$src2, ${src1}"#_.BroadcastStr, "${src1}"#_.BroadcastStr#", $src2",
(_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>,
EVEX_B, Sched<[sched.Folded]>;
}
@@ -5973,8 +5970,8 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -6245,8 +6242,8 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
@@ -6370,9 +6367,6 @@ defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
- def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
@@ -6419,29 +6413,33 @@ let Predicates = [HasAVX512] in {
//
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
AVX512FMA3Base, Sched<[sched]>;
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
+ (MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6450,74 +6448,88 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd> {
- defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd> {
+ defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
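// Rough sketch of the new OpNode/MaskOpNode split (illustrative, assuming
// AVX512_maskable_fma pairs an unmasked and a masked pattern): the unmasked
// pattern keeps the strict-FP-capable fragment (e.g. X86any_Fmadd) while the
// masked pattern uses the plain node (e.g. X86Fmadd), roughly:
//   unmasked: (_.VT (X86any_Fmadd _.RC:$src2, _.RC:$src1, _.RC:$src3))
//   masked:   (select $mask, (_.VT (X86Fmadd _.RC:$src2, _.RC:$src1, _.RC:$src3)),
//                     $passthru)   // shape of the masked pattern only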
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1,
- vselect, 1>, AVX512FMA3Base, Sched<[sched]>;
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr,
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6527,77 +6539,89 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (null_frag),
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
- defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>,
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
AVX512FMA3Base, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
// 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr,
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
(_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2)), 1, 0>,
+ _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6607,49 +6631,57 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (null_frag),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
- defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -6742,11 +6774,12 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
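// The scalar definitions above now also take the strict-FP-capable X86any_*
// fragments. By the usual convention an any_* fragment matches both the plain
// and the constrained (strict) node; a hypothetical declaration, not part of
// this patch, would look roughly like:
//   def X86any_Fmsub : PatFrags<(ops node:$a, node:$b, node:$c),
//                               [(X86strict_Fmsub node:$a, node:$b, node:$c),
//                                (X86Fmsub node:$a, node:$b, node:$c)]>;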
-multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
+multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
+ SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
@@ -6788,8 +6821,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
@@ -6799,8 +6832,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(_.ScalarLdFrag addr:$src3)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
@@ -6809,18 +6842,18 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, _.FRC:$src3,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
VR128X:$src1, VK1WM:$mask,
@@ -6828,19 +6861,19 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6848,9 +6881,9 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, _.FRC:$src3,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6858,28 +6891,28 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src3)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6903,7 +6936,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3, (i32 timm:$rc)),
@@ -6914,7 +6947,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(i32 timm:$rc)),
@@ -6925,7 +6958,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3, (i32 timm:$rc)),
@@ -6936,7 +6969,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(i32 timm:$rc)),
@@ -6948,23 +6981,23 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
}
}
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
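// Schematically, each masked scalar pattern above now matches the plain node
// (MaskedOp) under X86selects_mask and selects the *_Int masked instruction;
// for the first merge-masking VFMADD/SS case this is roughly (sketch only,
// operand types simplified):
//   (X86Movss $src1, (scalar_to_vector
//       (X86selects_mask $mask,
//          (X86Fmadd $src2, (extractelt $src1, 0), $src3),
//          (extractelt $src1, 0))))
//     ==> VFMADD213SSZr_Intk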
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -7194,7 +7227,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
- (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
+ (SrcVT.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} // Predicates = [HasAVX512]
@@ -7233,6 +7266,45 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string aliasStr> {
+ let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode SrcVT.FRC:$src))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ def rm : AVX512<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarLdFrag addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ } // Predicates = [HasAVX512]
+}
+
+defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info,
+ lrint, WriteCvtSS2I,
+ "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info,
+ llrint, WriteCvtSS2I,
+ "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info,
+ lrint, WriteCvtSD2I,
+ "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info,
+ llrint, WriteCvtSD2I,
+ "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64Zrm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64Zrr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64Zrm addr:$src)>;
+}
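// In the block above, the rr/rm patterns already cover (i32 (lrint f32/f64))
// and (i64 (llrint f32/f64)); the extra Pats route an i64-typed lrint to the
// 64-bit cvt instructions, whose own patterns use llrint. Expected selection,
// schematically:
//   (i32 (lrint FR32:$x))   --> VCVTSS2SIZrr
//   (i64 (llrint FR64:$x))  --> VCVTSD2SI64Zrr
//   (i64 (lrint FR64:$x))   --> VCVTSD2SI64Zrr   (via the extra Pat)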
+
// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
// which produce unnecessary vmovs{s,d} instructions
let Predicates = [HasAVX512] in {
@@ -7347,7 +7419,7 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst,
- (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
+ (OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} //HasAVX512
@@ -7404,7 +7476,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.ScalarIntMemCPat:$src2)))>,
+ (_Src.ScalarIntMemFrags addr:$src2)))>,
EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -7421,7 +7493,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
}
}
-// Scalar Coversion with SAE - suppress all exceptions
+// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
@@ -7506,55 +7578,63 @@ def : Pat<(v2f64 (X86Movsd
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode,
+ X86VectorVTInfo _Src, SDNode OpNode, SDNode MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
RegisterClass MaskRC = _.KRCWM,
- dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
+ dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src)))),
+ dag MaskLdDAG = (_.VT (MaskOpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rr : AVX512_maskable_cvt<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src),
(ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src),
(ins MaskRC:$mask, _Src.RC:$src),
OpcodeStr, "$src", "$src",
(_.VT (OpNode (_Src.VT _Src.RC:$src))),
- (vselect MaskRC:$mask,
- (_.VT (OpNode (_Src.VT _Src.RC:$src))),
- _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.ImmAllZerosV)>,
EVEX, Sched<[sched]>;
- defm rm : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rm : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src),
(ins _.RC:$src0, MaskRC:$mask, MemOp:$src),
(ins MaskRC:$mask, MemOp:$src),
OpcodeStr#Alias, "$src", "$src",
LdDAG,
- (vselect MaskRC:$mask, LdDAG, _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.RC:$src0),
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.ImmAllZerosV)>,
EVEX, Sched<[sched.Folded]>;
- defm rmb : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rmb : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src),
(ins _.RC:$src0, MaskRC:$mask, _Src.ScalarMemOp:$src),
(ins MaskRC:$mask, _Src.ScalarMemOp:$src),
OpcodeStr,
- "${src}"##Broadcast, "${src}"##Broadcast,
+ "${src}"#Broadcast, "${src}"#Broadcast,
(_.VT (OpNode (_Src.VT
(_Src.BroadcastLdFrag addr:$src))
)),
- (vselect MaskRC:$mask,
- (_.VT
- (OpNode
- (_Src.VT
- (_Src.BroadcastLdFrag addr:$src)))),
- _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.ImmAllZerosV)>,
EVEX, EVEX_B, Sched<[sched.Folded]>;
}
}
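// Where the old AVX512_maskable_common built the masked form from one DAG plus
// vselect, AVX512_maskable_cvt is now handed three explicit DAGs; for the rr
// form above these are (restating the hunk, not new behaviour):
//   unmasked:   (_.VT (OpNode _Src.RC:$src))
//   merge-mask: (vselect_mask $mask, (_.VT (MaskOpNode _Src.RC:$src)), _.RC:$src0)
//   zero-mask:  (vselect_mask $mask, (_.VT (MaskOpNode _Src.RC:$src)), _.ImmAllZerosV)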
-// Coversion with SAE - suppress all exceptions
+// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
@@ -7581,12 +7661,14 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode,
+ SDNode MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
RegisterClass MaskRC = _.KRCWM>
- : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias,
- MemOp, MaskRC,
+ : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, MaskOpNode, sched, Broadcast,
+ Alias, MemOp, MaskRC,
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src)),
(_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
// Extend Float to Double
@@ -7594,69 +7676,72 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
- any_fpextend, sched.ZMM>,
+ any_fpextend, fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86any_vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, any_fpextend,
- sched.YMM>, EVEX_V256;
+ X86any_vfpext, X86vfpext, sched.XMM, "{1to2}",
+ "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info,
+ any_fpextend, fpextend, sched.YMM>, EVEX_V256;
}
}
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86any_vfpround, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86any_vfpround, X86vfpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86any_vfpround,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
+ f128mem, VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info,
+ X86any_vfpround, X86vfpround,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7701,81 +7786,91 @@ let Predicates = [HasVLX] in {
// Convert Signed/Unsigned Doubleword to Double
let Uses = []<Register>, mayRaiseFPException = 0 in
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNode128,
+ SDNode MaskOpNode128,
+ X86SchedWriteWidths sched> {
// No rounding in this op
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
- sched.ZMM>, EVEX_V512;
+ MaskOpNode, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM,
+ OpNode128, MaskOpNode128, sched.XMM, "{1to2}",
+ "", i64mem, VK2WM,
(v2f64 (OpNode128 (bc_v4i32
(v2i64
+ (scalar_to_vector (loadi64 addr:$src)))))),
+ (v2f64 (MaskOpNode128 (bc_v4i32
+ (v2i64
(scalar_to_vector (loadi64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
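// For the 128-bit form above, the load DAGs are spelled out because only the
// low two i32 elements are consumed (via scalar_to_vector of a loadi64); with
// the VCVTDQ2PD instantiation below this expands roughly to:
//   unmasked: (v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))
//   masked:   (v2f64 (X86VSintToFP     (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src))))))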
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
}
@@ -7785,50 +7880,50 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7836,10 +7931,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Doubleword
multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7849,48 +7945,48 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7898,61 +7994,65 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Quadword
multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
- sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
- sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
}
}
// Convert Float to Signed/Unsigned Quadword
multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7960,21 +8060,26 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32
(v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
(scalar_to_vector (loadf64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7982,22 +8087,26 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32
(v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
(scalar_to_vector (loadf64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -8007,152 +8116,159 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
- sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ null_frag, sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256,
NotEVEX2VEXConvertible;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|"
"$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, i64mem:$src), 0, "att">;
}
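// The x/y-suffixed AT&T aliases make the source width explicit, since both the
// 128-bit and 256-bit forms of these conversions write an XMM destination.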
-defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86any_VSintToFP,
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp,
+ X86any_VSintToFP, X86VSintToFP,
SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
-defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp,
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPS2DQ>,
- XS, EVEX_CD8<32, CD8VF>;
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPD2DQ>,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS,
- EVEX_CD8<32, CD8VF>;
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPD2DQ>,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp,
- X86any_VUintToFP, SchedWriteCvtDQ2PD>, XS,
- EVEX_CD8<32, CD8VH>;
+ uint_to_fp, X86any_VUintToFP, X86VUintToFP,
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD,
- EVEX_CD8<32, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd,
+ SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>;
-defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VF>;
-defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
-defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp,
- X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
- EVEX_CD8<64, CD8VF>;
+ sint_to_fp, X86VSintToFpRnd,
+ SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
- EVEX_CD8<64, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>,
+ VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp,
- X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
- EVEX_CD8<64, CD8VF>;
+ sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, PS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
- EVEX_CD8<64, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, XD, EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
// Special patterns to allow use of X86mcvtp2Int for masking. Instruction
@@ -8275,70 +8391,70 @@ let Predicates = [HasVLX] in {
let Predicates = [HasDQI, HasVLX] in {
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2QQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2UQQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2QQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2UQQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
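// The patterns above fold a 64-bit vzload into the 128-bit CVT(T)PS2[U]QQ
// conversions; the masked forms use the plain conversion node (not the "any"
// wrapper) under vselect_mask.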
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- v2f64x_info.ImmAllZerosV)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- v2f64x_info.ImmAllZerosV)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -8408,16 +8524,17 @@ let Predicates = [HasDQI, HasVLX] in {
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop, PatFrag ld_frag,
+ X86MemOperand x86memop, dag ld_dag,
X86FoldableSchedWrite sched> {
- defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86any_cvtph2ps (_src.VT _src.RC:$src)),
(X86cvtph2ps (_src.VT _src.RC:$src))>,
T8PD, Sched<[sched]>;
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
(ins x86memop:$src), "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT
- (ld_frag addr:$src)))>,
+ (X86any_cvtph2ps (_src.VT ld_dag)),
+ (X86cvtph2ps (_src.VT ld_dag))>,
T8PD, Sched<[sched.Folded]>;
}
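// AVX512_maskable_split takes two dags here: X86any_cvtph2ps drives the
// unmasked pattern and plain X86cvtph2ps drives the masked ones.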
@@ -8432,23 +8549,22 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
}
let Predicates = [HasAVX512] in
- defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
- WriteCvtPH2PSZ>,
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem,
+ (load addr:$src), WriteCvtPH2PSZ>,
avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+ (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256,
EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- load, WriteCvtPH2PS>, EVEX, EVEX_V128,
+ (bitconvert (v2i64 (X86vzload64 addr:$src))),
+ WriteCvtPH2PS>, EVEX, EVEX_V128,
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
- (VCVTPH2PSZ128rm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(VCVTPH2PSZ128rm addr:$src)>;
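  // The 128-bit form reads only 64 bits of memory, so an i64 scalar load
  // bitcast to v8i16 selects the same VCVTPH2PSZ128rm instruction.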
}
@@ -8460,7 +8576,7 @@ let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+ (X86any_cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
Sched<[RR]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
@@ -8505,54 +8621,35 @@ let Predicates = [HasAVX512] in {
WriteCvtPS2PHZ, WriteCvtPS2PHZSt>,
avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
- let Predicates = [HasVLX] in {
- defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
- WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
- EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
- WriteCvtPS2PH, WriteCvtPS2PHSt>,
- EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
- }
+
+ def : Pat<(store (v16i16 (X86any_cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
+}
+
+let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
+ WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
+ WriteCvtPS2PH, WriteCvtPS2PHSt>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
(VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>;
- def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
- (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
-}
-
-// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasVLX] in {
- // Use MXCSR.RC for rounding instead of explicitly specifying the default
- // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
- // configurations we support (the default). However, falling back to MXCSR is
- // more consistent with other instructions, which are always controlled by it.
- // It's encoded as 0b100.
- def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
- (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
- (v8i16 (VCVTPS2PHZ128rr
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >;
}
// Unordered/Ordered scalar fp compare with Sae and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
+ X86FoldableSchedWrite sched = WriteFComX> {
let hasSideEffects = 0, Uses = [MXCSR] in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
@@ -8613,7 +8710,7 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG,
+ (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8646,7 +8743,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
(OpNode (_.VT
(_.BroadcastLdFrag addr:$src)))>,
EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -8701,7 +8798,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -8741,7 +8838,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
(OpNode (_.VT
(_.BroadcastLdFrag addr:$src)))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -8811,20 +8908,21 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (any_fsqrt _.RC:$src))>, EVEX,
+ (_.VT (any_fsqrt _.RC:$src)),
+ (_.VT (fsqrt _.RC:$src))>, EVEX,
Sched<[sched]>;
- defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (any_fsqrt (_.VT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (any_fsqrt (_.VT (_.LdFrag addr:$src))),
+ (fsqrt (_.VT (_.LdFrag addr:$src)))>, EVEX,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm mb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (any_fsqrt (_.VT
- (_.BroadcastLdFrag addr:$src)))>,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (any_fsqrt (_.VT (_.BroadcastLdFrag addr:$src))),
+ (fsqrt (_.VT (_.BroadcastLdFrag addr:$src)))>,
EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8879,7 +8977,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrts (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>,
+ (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -8952,7 +9050,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
+ (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -8971,13 +9069,13 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX512] in {
def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ (_.EltVT (!cast<Instruction>(NAME#r) (_.EltVT (IMPLICIT_DEF)),
_.FRC:$src1, timm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ (_.EltVT (!cast<Instruction>(NAME#m) (_.EltVT (IMPLICIT_DEF)),
addr:$src1, timm:$src2))>;
}
}
@@ -8996,13 +9094,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
dag OutMask, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
(extractelt _.VT:$dst, (iPTR 0))))),
(!cast<Instruction>("V"#OpcPrefix#r_Intk)
_.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
ZeroFP))),
(!cast<Instruction>("V"#OpcPrefix#r_Intkz)
@@ -9026,14 +9124,14 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
// same order as X86vmtrunc, X86vmtruncs, X86vmtruncus. This allows us to pass
// either to the multiclasses.
def select_trunc : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (trunc node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (trunc node:$src), node:$src0)>;
def select_truncs : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (X86vtruncs node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (X86vtruncs node:$src), node:$src0)>;
def select_truncus : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (X86vtruncus node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (X86vtruncus node:$src), node:$src0)>;
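// Each PatFrag pairs vselect_mask with one truncation flavor (plain,
// signed-saturating, unsigned-saturating) so a single multiclass can be
// instantiated with any of them.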
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDPatternOperator MaskNode,
@@ -9083,12 +9181,12 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
string Name> {
def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
- (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mr)
addr:$dst, SrcInfo.RC:$src)>;
def : Pat<(mtruncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst,
SrcInfo.KRCWM:$mask),
- (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mrk)
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
@@ -9548,6 +9646,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
@@ -9558,6 +9658,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
@@ -9565,6 +9667,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
}
}
@@ -9586,54 +9692,49 @@ def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
// FIXME: Improve scheduling of gather/scatter instructions.
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag GatherNode,
- RegisterClass MaskRC = _.KRCWM> {
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
- ExeDomain = _.ExeDomain in
+ ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb),
(ins _.RC:$src1, MaskRC:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- [(set _.RC:$dst, MaskRC:$mask_wb,
- (GatherNode (_.VT _.RC:$src1), MaskRC:$mask,
- vectoraddr:$src2))]>, EVEX, EVEX_K,
- EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
}
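// No ISel pattern is attached to the gather instruction itself; mayLoad and
// hasSideEffects are stated explicitly since, with no pattern, they cannot be
// inferred, and gather selection is presumably handled outside these .td
// patterns.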
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
- vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
- vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
- defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
- vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
- defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
- defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
}
}
multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
- mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem,
- mgatherv8i64>, EVEX_V512;
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vy256xmem, mgatherv8i32>, EVEX_V256;
- defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vy128xmem, mgatherv4i64>, EVEX_V256;
- defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mgatherv4i32>, EVEX_V128;
- defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64, VK2WM>,
- EVEX_V128;
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
}
}
@@ -9645,55 +9746,52 @@ defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q
avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag ScatterNode,
- RegisterClass MaskRC = _.KRCWM> {
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
-let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb),
(ins memop:$dst, MaskRC:$mask, _.RC:$src),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
- [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src),
- MaskRC:$mask, vectoraddr:$dst))]>,
- EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[WriteStore]>;
}
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
- vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
- vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
- defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
- vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
- defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
- defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
}
}
multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
- mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem,
- mscatterv8i64>, EVEX_V512;
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vy256xmem, mscatterv8i32>, EVEX_V256;
- defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vy128xmem, mscatterv4i64>, EVEX_V256;
- defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mscatterv4i32>, EVEX_V128;
- defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mscatterv2i64, VK2WM>,
- EVEX_V128;
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
}
}
@@ -9762,13 +9860,9 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
- !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
[(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
-
-// Also need a pattern for anyextend.
-def : Pat<(Vec.VT (anyext Vec.KRC:$src)),
- (!cast<Instruction>(NAME#"rr") Vec.KRC:$src)>;
}
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -9842,19 +9936,11 @@ let Predicates = [HasDQI, NoBWI] in {
(VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
(VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
-
- def : Pat<(v16i8 (anyext (v16i1 VK16:$src))),
- (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
- def : Pat<(v16i16 (anyext (v16i1 VK16:$src))),
- (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
}
let Predicates = [HasDQI, NoBWI, HasVLX] in {
def : Pat<(v8i16 (sext (v8i1 VK8:$src))),
(VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
-
- def : Pat<(v8i16 (anyext (v8i1 VK8:$src))),
- (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
}
//===----------------------------------------------------------------------===//
@@ -9885,14 +9971,14 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##mrk)
+ (!cast<Instruction>(Name#_.ZSuffix#mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
_.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
_.KRCWM:$mask, _.RC:$src)>;
}
@@ -9940,23 +10026,23 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
(_.VT _.RC:$src0))),
- (!cast<Instruction>(Name#_.ZSuffix##rmk)
+ (!cast<Instruction>(Name#_.ZSuffix#rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
_.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
_.KRCWM:$mask, _.RC:$src)>;
}
@@ -9990,26 +10076,33 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
// op(mem_vec,imm)
// op(broadcast(eltVt),imm)
// all instructions created with FROUND_CURRENT
-multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode MaskOpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rri : AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (i32 timm:$src2))>, Sched<[sched]>;
- defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (i32 timm:$src2)),
+ (MaskOpNode (_.VT _.RC:$src1), (i32 timm:$src2))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 timm:$src2))>,
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 timm:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rmbi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr##", $src2",
+ OpcodeStr#_.Suffix, "$src2, ${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr#", $src2",
(OpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
- (i32 timm:$src2))>, EVEX_B,
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
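// Each of the rri/rmi/rmbi forms above supplies two dags to
// AVX512_maskable_split: OpNode for the unmasked pattern and MaskOpNode for
// the masked ones.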
@@ -10021,7 +10114,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
+ OpcodeStr#_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 timm:$src2))>,
@@ -10030,18 +10123,19 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched,
+ Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512>,
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512>,
avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE,
sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128>, EVEX_V128;
- defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256>, EVEX_V256;
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128>, EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256>, EVEX_V256;
}
}
@@ -10068,8 +10162,8 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i32 timm:$src3))>, EVEX_B,
@@ -10111,8 +10205,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i8 timm:$src3))>, EVEX_B,
@@ -10135,7 +10229,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT _.ScalarIntMemCPat:$src2),
+ (_.ScalarIntMemFrags addr:$src2),
(i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10228,24 +10322,26 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeSAE, sched, prd>,
+ opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeSAE, sched, prd>,
+ opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, EVEX;
+ X86VReduce, X86VReduce, X86VReduceSAE,
+ SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86any_VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
+ X86any_VRndScale, X86VRndScale, X86VRndScaleSAE,
+ SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, EVEX;
+ X86VGetMant, X86VGetMant, X86VGetMantSAE,
+ SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
@@ -10302,8 +10398,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(_.VT
(bitconvert
(CastInfo.VT
@@ -10391,8 +10487,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(X86VAlign _.RC:$src1,
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i8 timm:$src3))>, EVEX_B,
@@ -10441,40 +10537,40 @@ def ValigndImm8XForm : SDNodeXForm<timm, [{
multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDNodeXForm ImmXForm> {
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1, From.RC:$src2,
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1, From.RC:$src2,
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (From.LdFrag addr:$src2),
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (From.LdFrag addr:$src2),
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
@@ -10491,24 +10587,24 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
(!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (bitconvert
- (To.VT (To.BroadcastLdFrag addr:$src2))),
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (bitconvert
- (To.VT (To.BroadcastLdFrag addr:$src2))),
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
@@ -10567,8 +10663,8 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
- "${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr,
+ "${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr,
(_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
@@ -10751,32 +10847,14 @@ defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
- (VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- (v2f64 VR128X:$src0)),
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
(v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- immAllZerosV),
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
- immAllZerosV),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- immAllZerosV),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -10784,9 +10862,9 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load
//===----------------------------------------------------------------------===//
let Uses = []<Register>, mayRaiseFPException = 0 in {
-defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, X86Unpckh, HasAVX512,
SchedWriteFShuffleSizes, 0, 1>;
-defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, X86Unpckl, HasAVX512,
SchedWriteFShuffleSizes>;
}
@@ -10945,16 +11023,15 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD,
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
-// FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well?
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
- def rr : AVX512<opc, MRMr,
+ def ri : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
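// The ri/mi suffixes match the SSE/AVX naming convention for immediate forms
// (e.g. PSLLDQri).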
- def rm : AVX512<opc, MRMm,
+ def mi : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
@@ -11106,8 +11183,8 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
- OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ OpcodeStr, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (_.BroadcastLdFrag addr:$src3)),
@@ -11117,12 +11194,12 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
@@ -11141,13 +11218,13 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching zero masking with loads in other
// positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
@@ -11156,31 +11233,31 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching masked loads with different
// operand orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
@@ -11200,14 +11277,14 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching zero masking with broadcasts in other
// positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src3),
_.RC:$src2, (i8 timm:$src4)),
@@ -11218,32 +11295,32 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching masked broadcasts with different
// operand orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(_.BroadcastLdFrag addr:$src3),
(i8 timm:$src4)), _.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2,
(_.BroadcastLdFrag addr:$src3),
_.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
@@ -11288,6 +11365,36 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
@@ -11305,6 +11412,66 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v4i32 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v4i32 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v4i32 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v2i64 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v2i64 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v2i64 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
@@ -11322,6 +11489,36 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
@@ -11338,6 +11535,66 @@ let Predicates = [HasVLX] in {
VR256X:$src2, (i8 timm:$src4))),
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i32 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i32 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i32 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v4i64 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v4i64 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v4i64 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
}
let Predicates = [HasAVX512] in {
@@ -11358,6 +11615,36 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
@@ -11371,9 +11658,84 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
- VR512:$src2, (i8 timm:$src4))),
+ VR512:$src2, (i8 timm:$src4))),
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i32 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i32 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i32 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i64 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i64 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i64 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
}
// Patterns to implement vnot using vpternlog instead of creating all ones
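
One concrete payoff of that trick, sketched with intrinsics (assumes AVX-512F): a truth table that depends on a single source makes VPTERNLOG a one-instruction bitwise NOT, with no all-ones constant to materialize. The in-tree patterns may use a different but equivalent truth table, since all three sources are the same register.

#include <immintrin.h>

// Truth table 0x55 returns the complement of the third source, so passing the
// same vector three times yields its bitwise NOT.
__m512i vectorNot(__m512i V) {
  return _mm512_ternarylogic_epi64(V, V, V, 0x55);
}
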
@@ -11484,14 +11846,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
(i32 timm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
@@ -11499,8 +11861,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ OpcodeStr#_.Suffix, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)),
@@ -11516,7 +11878,7 @@ multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(X86VFixupimmSAE (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
@@ -11533,7 +11895,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
@@ -11541,7 +11903,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR] in
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(X86VFixupimmSAEs (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
@@ -11550,7 +11912,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT (scalar_to_vector
@@ -11630,8 +11992,9 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode,
- X86VectorVTInfo _, PatLeaf ZeroFP> {
+multiclass AVX512_scalar_math_fp_patterns<SDNode Op, SDNode MaskedOp,
+ string OpcPrefix, SDNode MoveNode,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
def : Pat<(MoveNode
@@ -11639,79 +12002,79 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(_.VT (scalar_to_vector
(Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
_.FRC:$src)))),
- (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Int") _.VT:$dst,
(_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
def : Pat<(MoveNode
(_.VT VR128X:$dst),
(_.VT (scalar_to_vector
(Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
(_.ScalarLdFrag addr:$src))))),
- (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Int") _.VT:$dst, addr:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2),
_.FRC:$src0))),
- (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk)
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Intk")
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src2)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
_.FRC:$src0))),
- (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Intk")
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1, addr:$src2)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2), (_.EltVT ZeroFP)))),
- (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrr_Intkz")
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
- (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>;
}
}
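
The source-level shape the scalar-math-plus-move patterns above target is an operation on lane 0 that is reinserted into the same vector, which should collapse to a single ADDSS/SUBSS-style instruction instead of extract, op, insert. A sketch relying on the GCC/Clang vector-subscript extension; the masked variants are the same shape wrapped in a select on a one-bit mask:

#include <immintrin.h>

// Add a scalar into element 0 while preserving the upper lanes; subscripting
// __m128 is a GCC/Clang vector extension.
__m128 addIntoLane0(__m128 Dst, float S) {
  Dst[0] += S;
  return Dst;
}
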
-defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(_.VT (Move _.VT:$dst,
(scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#"Zr_Int") _.VT:$dst, _.VT:$src)>;
}
}
-defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
-defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
//===----------------------------------------------------------------------===//
// AES instructions
@@ -11724,13 +12087,13 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
loadv2i64, 0, VR128X, i128mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG;
defm Z256 : AESI_binop_rm_int<Op, OpStr,
- !cast<Intrinsic>(IntPrefix##"_256"),
+ !cast<Intrinsic>(IntPrefix#"_256"),
loadv4i64, 0, VR256X, i256mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512, HasVAES] in
defm Z : AESI_binop_rm_int<Op, OpStr,
- !cast<Intrinsic>(IntPrefix##"_512"),
+ !cast<Intrinsic>(IntPrefix#"_512"),
loadv8i64, 0, VR512, i512mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG;
}
@@ -11792,8 +12155,8 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
- "${src3}"##VTI.BroadcastStr##", $src2",
- "$src2, ${src3}"##VTI.BroadcastStr,
+ "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
AVX512FMA3Base, EVEX_B,
@@ -11827,22 +12190,22 @@ multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched,
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched,
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix#"w", sched,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp,
+ defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp,
OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
- defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode,
+ defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode,
sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
}
@@ -11890,8 +12253,8 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
- OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
- "$src2, ${src3}"##VTI.BroadcastStr,
+ OpStr, "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
@@ -12027,8 +12390,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
let ExeDomain = VTI.ExeDomain in
defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
- OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1",
- "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
+ OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1",
+ "$src1, ${src2}"#BcstVTI.BroadcastStr#", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
(bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))),
(i8 timm:$src3))>, EVEX_B,
@@ -12184,41 +12547,44 @@ multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
}
}
+let ExeDomain = SSEPackedSingle in
defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
- SchedWriteCvtPD2PS, //FIXME: Shoulod be SchedWriteCvtPS2BF
+ SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
avx512vl_f32_info, avx512vl_i16_info,
X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
// Truncate Float to BFloat16
multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
+ let ExeDomain = SSEPackedSingle in {
let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
- X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
+ X86cvtneps2bf16, X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasBF16, HasVLX] in {
let Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
- null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
VK4WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
- X86cvtneps2bf16,
+ X86cvtneps2bf16, X86cvtneps2bf16,
sched.YMM, "{1to8}", "{y}">, EVEX_V256;
}
+ } // Predicates = [HasBF16, HasVLX]
+ } // ExeDomain = SSEPackedSingle
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
- VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
- f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
- VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
- f256mem:$src), 0, "intel">;
- }
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
+ f128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
+ f256mem:$src), 0, "intel">;
}
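
At the intrinsics level the truncation defined by this multiclass is a single narrowing of packed floats to bfloat16. A sketch, assuming AVX512-BF16 support (for example -mavx512bf16):

#include <immintrin.h>

// Narrow 16 single-precision floats to 16 bfloat16 values (VCVTNEPS2BF16).
__m256bh floatsToBF16(__m512 V) {
  return _mm512_cvtneps_pbh(V);
}
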
defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
@@ -12262,25 +12628,24 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo src_v> {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.RC:$src3),
+ (ins src_v.RC:$src2, src_v.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>,
EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3),
+ (ins src_v.RC:$src2, src_v.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2,
- (src_v.VT (bitconvert
- (src_v.LdFrag addr:$src3)))))>, EVEX_4V,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
+ (src_v.LdFrag addr:$src3)))>, EVEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ (ins src_v.RC:$src2, src_v.ScalarMemOp:$src3),
OpcodeStr,
!strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr),
- (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
(src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -12302,6 +12667,7 @@ multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
+let ExeDomain = SSEPackedSingle in
defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
avx512vl_f32_info, avx512vl_i32_info,
HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
index 1e399a894490..f7f22285bd15 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
- Imm8, i8imm, relocImm8_su, i8imm, invalid_node,
+ Imm8, i8imm, imm_su, i8imm, invalid_node,
0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
- Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su,
+ Imm16, i16imm, imm_su, i16i8imm, i16immSExt8_su,
1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
- Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su,
+ Imm32, i32imm, imm_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su,
+ Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
@@ -1217,6 +1217,146 @@ def : Pat<(store (X86adc_flag GR64:$src, (loadi64 addr:$dst), EFLAGS),
addr:$dst),
(ADC64mr addr:$dst, GR64:$src)>;
+// Patterns for basic arithmetic ops with relocImm for the immediate field.
+multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNode (load addr:$dst), relocImm8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i16relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm16_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i32relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i16relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm16_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i32relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_F_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(OpNodeFlag (loadi8 addr:$src1), relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8mi") addr:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16mi8") addr:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16mi") addr:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32mi8") addr:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32mi") addr:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64mi8") addr:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64mi32") addr:$src1, i64relocImmSExt32_su:$src2)>;
+}
+
+defm AND : ArithBinOp_RF_relocImm_Pats<X86and_flag, and>;
+defm OR : ArithBinOp_RF_relocImm_Pats<X86or_flag, or>;
+defm XOR : ArithBinOp_RF_relocImm_Pats<X86xor_flag, xor>;
+defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add>;
+defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub>;
+
+defm ADC : ArithBinOp_RFF_relocImm_Pats<X86adc_flag>;
+defm SBB : ArithBinOp_RFF_relocImm_Pats<X86sbb_flag>;
+
+defm CMP : ArithBinOp_F_relocImm_Pats<X86cmp>;
+
+// ADC is commutable, but we can't indicate that to tablegen. So manually
+// reverse the operands.
+def : Pat<(X86adc_flag GR8:$src1, relocImm8_su:$src2, EFLAGS),
+ (ADC8ri relocImm8_su:$src2, GR8:$src1)>;
+def : Pat<(X86adc_flag i16relocImmSExt8_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri8 GR16:$src1, i16relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm16_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86adc_flag i32relocImmSExt8_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri8 GR32:$src1, i32relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm32_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt8_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt32_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(store (X86adc_flag relocImm8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (X86adc_flag i16relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi8 addr:$dst, i16relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm16_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (X86adc_flag i32relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi8 addr:$dst, i32relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi8 addr:$dst, i64relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi32 addr:$dst, i64relocImmSExt32_su:$src)>;
+
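
The relocImm patterns above (and the TEST variants below) exist so that an instruction's immediate field can carry a symbol plus relocation rather than a plain constant. A hedged source-level sketch; whether the symbol really lands in the immediate depends on the code model and relocation model, and the symbol name is a placeholder:

// Comparing a pointer against a linker-resolved address. Under a non-PIC
// small code model this can be selected as something like
//   cmpq $the_symbol, %rdi
// with a fixup recorded against the immediate field, which is the form the
// relocImm patterns match.
extern char the_symbol[];

extern "C" bool isTheSymbol(const void *P) {
  return P == static_cast<const void *>(the_symbol);
}
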
//===----------------------------------------------------------------------===//
// Semantically, test instructions are similar to AND, except they don't
// generate a result. From an encoding perspective, they are very different:
@@ -1247,7 +1387,6 @@ let isCompare = 1 in {
def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
- let Predicates = [In64BitMode] in
def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
@@ -1267,6 +1406,25 @@ let isCompare = 1 in {
"{$src, %rax|rax, $src}">;
} // isCompare
+// Patterns to match a relocImm into the immediate field.
+def : Pat<(X86testpat GR8:$src1, relocImm8_su:$src2),
+ (TEST8ri GR8:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat GR16:$src1, relocImm16_su:$src2),
+ (TEST16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat GR32:$src1, relocImm32_su:$src2),
+ (TEST32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat GR64:$src1, i64relocImmSExt32_su:$src2),
+ (TEST64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(X86testpat (loadi8 addr:$src1), relocImm8_su:$src2),
+ (TEST8mi addr:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat (loadi16 addr:$src1), relocImm16_su:$src2),
+ (TEST16mi addr:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat (loadi32 addr:$src1), relocImm32_su:$src2),
+ (TEST32mi addr:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (TEST64mi32 addr:$src1, i64relocImmSExt32_su:$src2)>;
+
//===----------------------------------------------------------------------===//
// ANDN Instruction
//
@@ -1306,7 +1464,6 @@ let Predicates = [HasBMI], AddedComplexity = -6 in {
multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
- let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
[]>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
@@ -1314,7 +1471,17 @@ let hasSideEffects = 0 in {
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
                !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
                []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+
+ // Pseudo instructions to be used when the low result isn't used. The
+ // instruction is defined to keep the high if both destinations are the same.
+ def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+ []>, Sched<[sched.Folded]>;
}
}
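
The new Hrr/Hrm pseudos cover widening multiplies where only the high half of the product is live, so MULX needs just one real destination. A sketch of the triggering shape; it uses the __uint128_t GCC/Clang extension and assumes BMI2 is enabled:

#include <cstdint>

// Only the upper 64 bits of the 128-bit product are consumed.
extern "C" uint64_t mulHigh(uint64_t A, uint64_t B) {
  return static_cast<uint64_t>((static_cast<__uint128_t>(A) * B) >> 64);
}
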
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h
index aa45e9b191c1..07079ef87fd4 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -207,7 +207,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
return addOffset(MIB.addFrameIndex(FI), Offset)
.addMemOperand(MMO);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
index 1fdac104cb73..4df93fb2ed60 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -111,8 +111,30 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
[(set GR64:$dst,
(X86SegAlloca GR64:$size))]>,
Requires<[In64BitMode]>;
+
+// To protect against stack clash, dynamic allocation should perform a memory
+// probe at each page.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca with probing",
+ [(set GR32:$dst,
+ (X86ProbedAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca with probing",
+ [(set GR64:$dst,
+ (X86ProbedAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
}
+let hasNoSchedulingInfo = 1 in
+def STACKALLOC_W_PROBING : I<0, Pseudo, (outs), (ins i64imm:$stacksize),
+ "# fixed size alloca with probing",
+ []>;
+
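
The probing pseudos correspond to dynamic stack allocations compiled with stack-clash protection, where the allocation has to touch every page it grows across. A sketch of the triggering pattern; it relies on the GCC/Clang __builtin_alloca builtin and the -fstack-clash-protection flag:

#include <cstring>

extern "C" void consume(char *Buf, unsigned Len);

// A variable-sized stack allocation: with stack-clash protection its expansion
// probes the stack at least once per page as it grows.
extern "C" void withScratch(unsigned N) {
  char *Buf = static_cast<char *>(__builtin_alloca(N));
  std::memset(Buf, 0, N);
  consume(Buf, N);
}
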
// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
// targets. These calls are needed to probe the stack when allocating more than
// 4k bytes in one go. Touching the stack at 4K increments is necessary to
@@ -177,18 +199,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
[(catchret bb:$dst, bb:$from)]>;
}
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
- usesCustomInserter = 1 in
-def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
-
-// This instruction is responsible for re-establishing stack pointers after an
-// exception has been caught and we are rejoining normal control flow in the
-// parent function or funclet. It generally sets ESP and EBP, and optionally
-// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
-// elsewhere.
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
-def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
-
let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in {
def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
@@ -308,69 +318,26 @@ def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
-let isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
-def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "", []>;
-
-// This 64-bit pseudo-move can be used for both a 64-bit constant that is
-// actually the zero-extension of a 32-bit constant and for labels in the
-// x86-64 small code model.
-def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
-
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, SchedRW = [WriteMove] in
+def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
+
+// This 64-bit pseudo-move can also be used for labels in the x86-64 small code
+// model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [X86Wrapper]>;
def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>;
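
The constants MOV32ri64 stands for are 64-bit values whose upper half is zero, which a 32-bit mov materializes for free through implicit zero extension. For instance:

#include <cstdint>

// The top 32 bits are zero, so this is typically emitted as a 32-bit move
// such as "movl $0x12345678, %eax" rather than a movabsq.
extern "C" uint64_t smallConstant() {
  return UINT64_C(0x12345678);
}
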
// Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteADC],
+ hasSideEffects = 0 in {
// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
// However, Pat<> can't replicate the destination reg into the inputs of the
// result.
-def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
- [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
- [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", []>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", []>;
} // isCodeGenOnly
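
The carry-materialization pseudos stand in for the classic compare-then-sbb-a-register-with-itself idiom that turns the carry flag into an all-ones or all-zeros mask. In source form, roughly:

#include <cstdint>

// 0xFFFFFFFF when A < B (the compare sets carry), 0 otherwise; commonly
// selected as "cmp; sbb %eax, %eax" instead of a branch or setcc plus negate.
extern "C" uint32_t borrowMask(uint32_t A, uint32_t B) {
  return A < B ? UINT32_C(0xFFFFFFFF) : UINT32_C(0);
}
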
-
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
-// will be eliminated and that the sbb can be extended up to a wider type. When
-// this happens, it is great. However, if we are left with an 8-bit sbb and an
-// and, we might as well just match it as a setb.
-def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
- (SETCCr (i8 2))>;
-
-// Patterns to give priority when both inputs are zero so that we don't use
-// an immediate for the RHS.
-// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out?
-def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS),
- (SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit),
- (EXTRACT_SUBREG (MOV32r0), sub_8bit))>;
-def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS),
- (SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit),
- (EXTRACT_SUBREG (MOV32r0), sub_16bit))>;
-def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS),
- (SBB32rr (MOV32r0), (MOV32r0))>;
-def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS),
- (SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit),
- (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>;
-
//===----------------------------------------------------------------------===//
// String Pseudo Instructions
//
@@ -568,10 +535,13 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
- let Predicates = [NoAVX512] in {
+ let Predicates = [HasMMX] in
+ defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
+
+ let Predicates = [HasSSE1,NoAVX512] in
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ let Predicates = [HasSSE2,NoAVX512] in
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
- }
let Predicates = [HasAVX512] in {
defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
@@ -585,6 +555,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _VR256X : CMOVrr_PSEUDO<VR256X, v4i64>;
}
defm _VR512 : CMOVrr_PSEUDO<VR512, v8i64>;
+ defm _VK1 : CMOVrr_PSEUDO<VK1, v1i1>;
defm _VK2 : CMOVrr_PSEUDO<VK2, v2i1>;
defm _VK4 : CMOVrr_PSEUDO<VK4, v4i1>;
defm _VK8 : CMOVrr_PSEUDO<VK8, v8i1>;
@@ -880,7 +851,7 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
// it. In other words, the register will not fix the clobbering of
// RBX that will happen when setting the arguments for the instruction.
//
-// Unlike the actual related instuction, we mark that this one
+// Unlike the actual related instruction, we mark that this one
// defines EBX (instead of using EBX).
// The rationale is that we will define RBX during the expansion of
// the pseudo. The argument feeding EBX is ebx_input.
@@ -1815,21 +1786,24 @@ multiclass MaskedRotateAmountPats<SDNode frag, string name> {
defm : MaskedRotateAmountPats<rotl, "ROL">;
defm : MaskedRotateAmountPats<rotr, "ROR">;
-// Double shift amount is implicitly masked.
-multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
- // (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
- def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
-
- // (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
-}
-
-defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
-defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+// Double "funnel" shift amount is implicitly masked.
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32)
+def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
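
The redundant masking these patterns strip out appears naturally when a funnel shift is written by hand, because SHLD/SHRD already reduce the count modulo the operand width. A sketch of the 32-bit case:

#include <cstdint>

// Funnel shift left with the usual explicit "& 31" on the amount; SHLD already
// masks the count modulo 32, so the AND folds away.
extern "C" uint32_t funnelShiftLeft(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt &= 31;
  if (Amt == 0)
    return Hi; // avoid the undefined 32-bit shift by 32 below
  return (Hi << Amt) | (Lo >> (32 - Amt));
}
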
let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
@@ -1919,15 +1893,6 @@ defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
-
-// (anyext (setcc_carry)) -> (setcc_carry)
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-
//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
@@ -1999,10 +1964,6 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
-// sub reg, relocImm
-def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
- (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
-
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
index 1842dc19ec2e..4f7867744017 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrControl.td
@@ -193,14 +193,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
"ljmp{l}\t$seg, $off", []>,
OpSize32, Sched<[WriteJump]>;
}
- def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
-
- let AsmVariantName = "att" in
- def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
- def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ let mayLoad = 1 in {
+ def FARJMP64m : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
+
+ let AsmVariantName = "att" in
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
}
// Loop instructions
@@ -275,10 +277,12 @@ let isCall = 1 in
OpSize32, Sched<[WriteJump]>;
}
- def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
- "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
- def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
- "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ let mayLoad = 1 in {
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
}
@@ -351,7 +355,8 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
}
- def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ let mayLoad = 1 in
+ def FARCALL64m : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
"lcall{q}\t{*}$dst", []>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
index 9e43a532a3f8..4dbd6bb8cd7e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA.td
@@ -126,7 +126,7 @@ let ExeDomain = SSEPackedSingle in {
loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
@@ -141,7 +141,7 @@ let ExeDomain = SSEPackedDouble in {
loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
@@ -154,19 +154,19 @@ let ExeDomain = SSEPackedDouble in {
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
-// can be commuted. In many cases such commute transformation requres an opcode
+// can be commuted. In many cases such commute transformation requires an opcode
// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
// would require an opcode change to FMA*231:
// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
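
A short, hedged C++ sketch of the form numbering the comment above refers to (parameter names are illustrative); it shows why commuting the first two operands of a 132 form lands on the 231 form:

// Operand 1 is the destination; the suffix digits say which sources feed the
// multiply and which one is added.
double fmadd132(double dst, double s2, double s3) { return dst * s3 + s2; }
double fmadd213(double dst, double s2, double s3) { return s2 * dst + s3; }
double fmadd231(double dst, double s2, double s3) { return s2 * s3 + dst; }

// fmadd132(s2, dst, s3) computes s2 * s3 + dst, i.e. fmadd231(dst, s2, s3),
// so swapping operands 1 and 2 requires the 132 -> 231 opcode change.
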
@@ -283,7 +283,7 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
[]>, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-// The FMA 213 form is created for lowering of scalar FMA intrinscis
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
// to machine instructions.
// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd operands
// of FMA 213 form.
@@ -321,12 +321,12 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
@@ -373,14 +373,14 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -542,26 +542,26 @@ let ExeDomain = SSEPackedSingle in {
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -575,26 +575,26 @@ let ExeDomain = SSEPackedDouble in {
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -630,11 +630,11 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 25bbdddb7a21..6d803e931b68 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -116,11 +116,8 @@ static void verifyTables() {
#ifndef NDEBUG
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(Groups), std::end(Groups)) &&
- std::is_sorted(std::begin(RoundGroups), std::end(RoundGroups)) &&
- std::is_sorted(std::begin(BroadcastGroups),
- std::end(BroadcastGroups)) &&
- "FMA3 tables not sorted!");
+ assert(llvm::is_sorted(Groups) && llvm::is_sorted(RoundGroups) &&
+ llvm::is_sorted(BroadcastGroups) && "FMA3 tables not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h
index 7fa6f5917862..ce0a7cc7f82e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -14,11 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
-#include "X86.h"
-#include "llvm/ADT/DenseMap.h"
-#include <cassert>
#include <cstdint>
-#include <set>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
index 1830262205c6..67dcb8d00ea5 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -22,24 +22,17 @@ def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>,
SDTCisPtrTy<1>]>;
def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
-def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
- [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
- SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
- SDNPMemOperand]>;
def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
- [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
- SDNPMemOperand]>;
-def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
@@ -79,8 +72,9 @@ def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
}]>;
-def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{
- return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+def X86fist32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
def X86fist64 : PatFrag<(ops node:$val, node:$ptr),
@@ -292,7 +286,7 @@ defm MUL : FPBinary_rr<any_fmul>;
defm DIV : FPBinary_rr<any_fdiv>;
}
-// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<any_fadd, MRM0m, "add">;
defm SUB : FPBinary<any_fsub, MRM4m, "sub">;
@@ -381,7 +375,8 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1 in {
+let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1,
+ mayLoad = 1 in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -396,21 +391,22 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
} // SchedRW
let SchedRW = [WriteMicrocoded] in {
-let Defs = [FPSW, FPCW] in {
+let Defs = [FPSW, FPCW], mayLoad = 1 in {
def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
}
-let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW] in {
+let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in {
def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
}
-let Uses = [FPSW] in
+let Uses = [FPSW], mayStore = 1 in
def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
+let mayLoad = 1 in
def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
-let Uses = [FPCW] ,mayRaiseFPException = 1 in
+let Uses = [FPCW] ,mayRaiseFPException = 1, mayStore = 1 in
def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
} // SchedRW
@@ -534,14 +530,20 @@ def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
let mayStore = 1, hasSideEffects = 0 in {
def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist32 RFP32:$src, addr:$op)]>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist64 RFP32:$src, addr:$op)]>;
def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist32 RFP64:$src, addr:$op)]>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist64 RFP64:$src, addr:$op)]>;
def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
-def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
-def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist32 RFP80:$src, addr:$op)]>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist64 RFP80:$src, addr:$op)]>;
} // mayStore
} // SchedRW, Uses = [FPCW]
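
One situation that has historically reached these x87 integer-store patterns is a 64-bit atomic store on 32-bit x86 without SSE2; the sketch below only illustrates the source shape, and the fild/fistp lowering is an assumption about that configuration rather than something stated by this patch.

#include <atomic>

// Without SSE2, i386 code may move a 64-bit value through the x87 stack
// (fild m64 / fistp m64) to store it atomically, which is where patterns
// such as IST_Fp64m80 / X86fist64 come into play.
std::atomic<long long> Shared;

void publish(long long V) {
  Shared.store(V, std::memory_order_relaxed);
}
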
@@ -601,6 +603,7 @@ let SchedRW = [WriteMove], Uses = [FPCW] in {
def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+let mayRaiseFPException = 0 in
def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
}
@@ -620,13 +623,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteFLD0], Uses = [FPCW] in
+let SchedRW = [WriteFLD0], Uses = [FPCW], mayRaiseFPException = 0 in
def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
-let SchedRW = [WriteFLD1], Uses = [FPCW] in
+let SchedRW = [WriteFLD1], Uses = [FPCW], mayRaiseFPException = 0 in
def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
-let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW], mayRaiseFPException = 0 in {
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
@@ -635,25 +638,19 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFCom], Uses = [FPCW] in {
-def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP32:$lhs, RFP32:$rhs)))]>;
-def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP64:$lhs, RFP64:$rhs)))]>;
-def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP80:$lhs, RFP80:$rhs)))]>;
-def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP32:$lhs, RFP32:$rhs)))]>;
-def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP64:$lhs, RFP64:$rhs)))]>;
-def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP80:$lhs, RFP80:$rhs)))]>;
+let SchedRW = [WriteFCom], Uses = [FPCW], hasSideEffects = 0 in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
+def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
} // SchedRW
} // mayRaiseFPException = 1
let SchedRW = [WriteFCom], mayRaiseFPException = 1 in {
// CC = ST(0) cmp ST(i)
-let Defs = [EFLAGS, FPCW], Uses = [FPCW] in {
+let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>,
Requires<[FPStackf32, HasCMov]>;
@@ -698,10 +695,9 @@ def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
// Floating point flag ops.
let SchedRW = [WriteALU] in {
-let Defs = [AX, FPSW], Uses = [FPSW] in
+let Defs = [AX, FPSW], Uses = [FPSW], hasSideEffects = 0 in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
- (outs), (ins), "fnstsw\t{%ax|ax}",
- [(set AX, (X86fp_stsw FPSW))]>;
+ (outs), (ins), "fnstsw\t{%ax|ax}", []>;
let Defs = [FPSW], Uses = [FPCW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
@@ -754,20 +750,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
let Uses = [FPSW, FPCW] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS,
Requires<[HasFXSR]>;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
"fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
- TB, Requires<[HasFXSR, In64BitMode]>;
+ PS, Requires<[HasFXSR, In64BitMode]>;
} // Uses = [FPSW, FPCW]
let Defs = [FPSW, FPCW] in {
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
- TB, Requires<[HasFXSR]>;
+ PS, Requires<[HasFXSR]>;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
- TB, Requires<[HasFXSR, In64BitMode]>;
+ PS, Requires<[HasFXSR, In64BitMode]>;
} // Defs = [FPSW, FPCW]
} // SchedRW
@@ -799,13 +795,6 @@ def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
-// Used to conv. i64 to f64 since there isn't a SSE version.
-def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>;
-
-// Used to conv. between f80 and i64 for i64 atomic loads.
-def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>;
-def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>;
-
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index f3b286e0375c..e16382e956c5 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -486,7 +486,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
{ X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
{ X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
{ X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
@@ -494,7 +496,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
{ X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
{ X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
{ X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
@@ -627,18 +631,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE },
{ X86::VAESIMCrr, X86::VAESIMCrm, 0 },
{ X86::VAESKEYGENASSIST128rr,X86::VAESKEYGENASSIST128rm,0 },
- { X86::VBROADCASTF32X2Z256r, X86::VBROADCASTF32X2Z256m, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zr, X86::VBROADCASTF32X2Zm, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128r, X86::VBROADCASTI32X2Z128m, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256r, X86::VBROADCASTI32X2Z256m, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zr, X86::VBROADCASTI32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rr,X86::VBROADCASTF32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrr, X86::VBROADCASTF32X2Zrm, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rr,X86::VBROADCASTI32X2Z128rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rr,X86::VBROADCASTI32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrr, X86::VBROADCASTI32X2Zrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rr, X86::VBROADCASTSDZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rr, X86::VBROADCASTSSZ128rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rr, X86::VBROADCASTSSZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
{ X86::VCOMISDZrr, X86::VCOMISDZrm, 0 },
{ X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE },
@@ -710,15 +714,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 },
{ X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 },
{ X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 },
+ { X86::VCVTSD2SI64Zrr, X86::VCVTSD2SI64Zrm, 0 },
{ X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
{ X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIZrr, X86::VCVTSD2SIZrm, 0 },
{ X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
{ X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE },
{ X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64Zrr, X86::VCVTSS2SI64Zrm, 0 },
{ X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
{ X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIZrr, X86::VCVTSS2SIZrm, 0 },
{ X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
{ X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE },
{ X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE },
@@ -906,24 +918,24 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPABSWZrr, X86::VPABSWZrm, 0 },
{ X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ128r, X86::VPBROADCASTBZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256r, X86::VPBROADCASTBZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTBZr, X86::VPBROADCASTBZm, TB_NO_REVERSE },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrr, X86::VPBROADCASTBZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBrr , X86::VPBROADCASTBrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128r, X86::VPBROADCASTDZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256r, X86::VPBROADCASTDZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTDZr, X86::VPBROADCASTDZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rr, X86::VPBROADCASTDZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rr, X86::VPBROADCASTDZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrr, X86::VPBROADCASTDZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128r, X86::VPBROADCASTQZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256r, X86::VPBROADCASTQZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTQZr, X86::VPBROADCASTQZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rr, X86::VPBROADCASTQZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rr, X86::VPBROADCASTQZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrr, X86::VPBROADCASTQZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128r, X86::VPBROADCASTWZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256r, X86::VPBROADCASTWZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTWZr, X86::VPBROADCASTWZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rr, X86::VPBROADCASTWZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rr, X86::VPBROADCASTWZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrr, X86::VPBROADCASTWZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
@@ -1100,9 +1112,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
- { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
- { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
- { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
+ { X86::VPSLLDQZ128ri, X86::VPSLLDQZ128mi, 0 },
+ { X86::VPSLLDQZ256ri, X86::VPSLLDQZ256mi, 0 },
+ { X86::VPSLLDQZri, X86::VPSLLDQZmi, 0 },
{ X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
{ X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
{ X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
@@ -1121,9 +1133,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
{ X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
{ X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
- { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
- { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
+ { X86::VPSRLDQZ128ri, X86::VPSRLDQZ128mi, 0 },
+ { X86::VPSRLDQZ256ri, X86::VPSRLDQZ256mi, 0 },
+ { X86::VPSRLDQZri, X86::VPSRLDQZmi, 0 },
{ X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
{ X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
{ X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
@@ -1609,16 +1621,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
{ X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
{ X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
- { X86::VBROADCASTF32X2Z256rkz, X86::VBROADCASTF32X2Z256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zrkz, X86::VBROADCASTF32X2Zmkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128rkz, X86::VBROADCASTI32X2Z128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256rkz, X86::VBROADCASTI32X2Z256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zrkz, X86::VBROADCASTI32X2Zmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrkz, X86::VBROADCASTI32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrkz, X86::VBROADCASTI32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrkz, X86::VBROADCASTSDZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrkz, X86::VBROADCASTSDZrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE },
{ X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
@@ -2153,18 +2165,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
- { X86::VPBROADCASTBZ128rkz, X86::VPBROADCASTBZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256rkz, X86::VPBROADCASTBZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTBZrkz, X86::VPBROADCASTBZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128rkz, X86::VPBROADCASTDZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256rkz, X86::VPBROADCASTDZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZrkz, X86::VPBROADCASTDZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128rkz, X86::VPBROADCASTQZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256rkz, X86::VPBROADCASTQZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZrkz, X86::VPBROADCASTQZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128rkz, X86::VPBROADCASTWZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256rkz, X86::VPBROADCASTWZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZrkz, X86::VPBROADCASTWZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrkz, X86::VPBROADCASTBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrkz, X86::VPBROADCASTBZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrkz, X86::VPBROADCASTDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrkz, X86::VPBROADCASTDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrkz, X86::VPBROADCASTDZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrkz, X86::VPBROADCASTQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrkz, X86::VPBROADCASTQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrkz, X86::VPBROADCASTQZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrkz, X86::VPBROADCASTWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrkz, X86::VPBROADCASTWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrkz, X86::VPBROADCASTWZrmkz, TB_NO_REVERSE },
{ X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 },
{ X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 },
{ X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 },
@@ -3010,16 +3022,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 },
{ X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 },
{ X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 },
- { X86::VBROADCASTF32X2Z256rk, X86::VBROADCASTF32X2Z256mk, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zrk, X86::VBROADCASTF32X2Zmk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128rk, X86::VBROADCASTI32X2Z128mk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256rk, X86::VBROADCASTI32X2Z256mk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zrk, X86::VBROADCASTI32X2Zmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrk, X86::VBROADCASTI32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrk, X86::VBROADCASTI32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrk, X86::VBROADCASTSDZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrk, X86::VBROADCASTSDZrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE },
{ X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
{ X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
{ X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
@@ -3662,18 +3674,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 },
{ X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 },
{ X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 },
- { X86::VPBROADCASTBZ128rk, X86::VPBROADCASTBZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256rk, X86::VPBROADCASTBZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTBZrk, X86::VPBROADCASTBZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128rk, X86::VPBROADCASTDZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256rk, X86::VPBROADCASTDZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZrk, X86::VPBROADCASTDZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128rk, X86::VPBROADCASTQZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256rk, X86::VPBROADCASTQZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZrk, X86::VPBROADCASTQZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128rk, X86::VPBROADCASTWZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256rk, X86::VPBROADCASTWZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZrk, X86::VPBROADCASTWZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rrk, X86::VPBROADCASTBZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrk, X86::VPBROADCASTBZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrk, X86::VPBROADCASTBZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrk, X86::VPBROADCASTDZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrk, X86::VPBROADCASTDZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrk, X86::VPBROADCASTDZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrk, X86::VPBROADCASTQZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrk, X86::VPBROADCASTQZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrk, X86::VPBROADCASTQZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrk, X86::VPBROADCASTWZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrk, X86::VPBROADCASTWZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrk, X86::VPBROADCASTWZrmk, TB_NO_REVERSE },
{ X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
{ X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
@@ -5509,6 +5521,12 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = {
{ X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS },
{ X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS },
{ X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmbi, TB_BCAST_D },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q },
};
static const X86MemoryFoldTableEntry *
@@ -5517,53 +5535,45 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
// Make sure the tables are sorted.
static std::atomic<bool> FoldTablesChecked(false);
if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(MemoryFoldTable2Addr),
- std::end(MemoryFoldTable2Addr)) &&
+ assert(llvm::is_sorted(MemoryFoldTable2Addr) &&
std::adjacent_find(std::begin(MemoryFoldTable2Addr),
std::end(MemoryFoldTable2Addr)) ==
- std::end(MemoryFoldTable2Addr) &&
+ std::end(MemoryFoldTable2Addr) &&
"MemoryFoldTable2Addr is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable0),
- std::end(MemoryFoldTable0)) &&
+ assert(llvm::is_sorted(MemoryFoldTable0) &&
std::adjacent_find(std::begin(MemoryFoldTable0),
std::end(MemoryFoldTable0)) ==
- std::end(MemoryFoldTable0) &&
+ std::end(MemoryFoldTable0) &&
"MemoryFoldTable0 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable1),
- std::end(MemoryFoldTable1)) &&
+ assert(llvm::is_sorted(MemoryFoldTable1) &&
std::adjacent_find(std::begin(MemoryFoldTable1),
std::end(MemoryFoldTable1)) ==
- std::end(MemoryFoldTable1) &&
+ std::end(MemoryFoldTable1) &&
"MemoryFoldTable1 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable2),
- std::end(MemoryFoldTable2)) &&
+ assert(llvm::is_sorted(MemoryFoldTable2) &&
std::adjacent_find(std::begin(MemoryFoldTable2),
std::end(MemoryFoldTable2)) ==
- std::end(MemoryFoldTable2) &&
+ std::end(MemoryFoldTable2) &&
"MemoryFoldTable2 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable3),
- std::end(MemoryFoldTable3)) &&
+ assert(llvm::is_sorted(MemoryFoldTable3) &&
std::adjacent_find(std::begin(MemoryFoldTable3),
std::end(MemoryFoldTable3)) ==
- std::end(MemoryFoldTable3) &&
+ std::end(MemoryFoldTable3) &&
"MemoryFoldTable3 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable4),
- std::end(MemoryFoldTable4)) &&
+ assert(llvm::is_sorted(MemoryFoldTable4) &&
std::adjacent_find(std::begin(MemoryFoldTable4),
std::end(MemoryFoldTable4)) ==
- std::end(MemoryFoldTable4) &&
+ std::end(MemoryFoldTable4) &&
"MemoryFoldTable4 is not sorted and unique!");
- assert(std::is_sorted(std::begin(BroadcastFoldTable2),
- std::end(BroadcastFoldTable2)) &&
+ assert(llvm::is_sorted(BroadcastFoldTable2) &&
std::adjacent_find(std::begin(BroadcastFoldTable2),
std::end(BroadcastFoldTable2)) ==
- std::end(BroadcastFoldTable2) &&
+ std::end(BroadcastFoldTable2) &&
"BroadcastFoldTable2 is not sorted and unique!");
- assert(std::is_sorted(std::begin(BroadcastFoldTable3),
- std::end(BroadcastFoldTable3)) &&
+ assert(llvm::is_sorted(BroadcastFoldTable3) &&
std::adjacent_find(std::begin(BroadcastFoldTable3),
std::end(BroadcastFoldTable3)) ==
- std::end(BroadcastFoldTable3) &&
+ std::end(BroadcastFoldTable3) &&
"BroadcastFoldTable3 is not sorted and unique!");
FoldTablesChecked.store(true, std::memory_order_relaxed);
}
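
For reference, a self-contained sketch of the same sorted-and-unique precondition and the binary-search style lookup it presumably protects (the Entry type and lookup helper are simplified stand-ins, not the patch's actual definitions; llvm::is_sorted is just the range-based wrapper over the std::is_sorted call used here):

#include <algorithm>
#include <cassert>
#include <iterator>

struct Entry {
  unsigned RegOp, MemOp;
  bool operator<(const Entry &RHS) const { return RegOp < RHS.RegOp; }
  bool operator==(const Entry &RHS) const { return RegOp == RHS.RegOp; }
};

static const Entry Table[] = {{1, 10}, {2, 20}, {4, 40}};

const Entry *lookup(unsigned RegOp) {
  // The precondition the asserts above enforce: sorted and duplicate-free,
  // so a single lower_bound gives a unique answer.
  assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
         std::adjacent_find(std::begin(Table), std::end(Table)) ==
             std::end(Table) &&
         "table is not sorted and unique!");
  const Entry *I =
      std::lower_bound(std::begin(Table), std::end(Table), Entry{RegOp, 0});
  return (I != std::end(Table) && I->RegOp == RegOp) ? I : nullptr;
}
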
@@ -5639,7 +5649,7 @@ struct X86MemUnfoldTable {
addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3)
- // Index 2, folded broadcast
+ // Index 3, folded broadcast
addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
// Sort the memory->reg unfold table.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h
index 7dc236a0d7e4..b7aca27ab2bb 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
#define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
-#include "llvm/Support/DataTypes.h"
+#include <cstdint>
namespace llvm {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
index 2f797fcfb8de..d7752e656b55 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td
@@ -27,26 +27,33 @@ def RawFrmDstSrc : Format<6>;
def RawFrmImm8 : Format<7>;
def RawFrmImm16 : Format<8>;
def AddCCFrm : Format<9>;
-def MRMDestMem : Format<32>;
-def MRMSrcMem : Format<33>;
-def MRMSrcMem4VOp3 : Format<34>;
-def MRMSrcMemOp4 : Format<35>;
-def MRMSrcMemCC : Format<36>;
-def MRMXmCC: Format<38>;
-def MRMXm : Format<39>;
-def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
-def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
-def MRM6m : Format<46>; def MRM7m : Format<47>;
-def MRMDestReg : Format<48>;
-def MRMSrcReg : Format<49>;
-def MRMSrcReg4VOp3 : Format<50>;
-def MRMSrcRegOp4 : Format<51>;
-def MRMSrcRegCC : Format<52>;
-def MRMXrCC: Format<54>;
-def MRMXr : Format<55>;
-def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
-def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
-def MRM6r : Format<62>; def MRM7r : Format<63>;
+def PrefixByte : Format<10>;
+def MRMr0 : Format<21>;
+def MRMSrcMemFSIB : Format<22>;
+def MRMDestMemFSIB : Format<23>;
+def MRMDestMem : Format<24>;
+def MRMSrcMem : Format<25>;
+def MRMSrcMem4VOp3 : Format<26>;
+def MRMSrcMemOp4 : Format<27>;
+def MRMSrcMemCC : Format<28>;
+def MRMXmCC: Format<30>;
+def MRMXm : Format<31>;
+def MRM0m : Format<32>; def MRM1m : Format<33>; def MRM2m : Format<34>;
+def MRM3m : Format<35>; def MRM4m : Format<36>; def MRM5m : Format<37>;
+def MRM6m : Format<38>; def MRM7m : Format<39>;
+def MRMDestReg : Format<40>;
+def MRMSrcReg : Format<41>;
+def MRMSrcReg4VOp3 : Format<42>;
+def MRMSrcRegOp4 : Format<43>;
+def MRMSrcRegCC : Format<44>;
+def MRMXrCC: Format<46>;
+def MRMXr : Format<47>;
+def MRM0r : Format<48>; def MRM1r : Format<49>; def MRM2r : Format<50>;
+def MRM3r : Format<51>; def MRM4r : Format<52>; def MRM5r : Format<53>;
+def MRM6r : Format<54>; def MRM7r : Format<55>;
+def MRM0X : Format<56>; def MRM1X : Format<57>; def MRM2X : Format<58>;
+def MRM3X : Format<59>; def MRM4X : Format<60>; def MRM5X : Format<61>;
+def MRM6X : Format<62>; def MRM7X : Format<63>;
def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>;
def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>;
def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3250123e5aa6..f3f7d17d9b3c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -59,9 +59,13 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
-def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
-def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -535,8 +539,20 @@ def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
[(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
(X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3),
+ (X86Fnmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fmsub node:$src1, node:$src2, node:$src3),
+ (X86Fmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3),
+ (X86Fnmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;
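
The X86any_* fragments added above let a single pattern match both the relaxed node and its chained STRICT_ counterpart. As a hedged illustration of source that can produce the negated fused forms (the exact lowering, e.g. selecting a VFNMSUB instruction for the negation, is an assumption rather than something this diff states):

#include <cmath>

// -(a*b + c) equals -(a*b) - c, the FNMSUB operation; under strict FP
// semantics the DAG carries the chained STRICT_FNMSUB node instead, and the
// X86any_Fnmsub fragment matches either variant.
double neg_fused(double a, double b, double c) {
  return -std::fma(a, b, c);
}
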
@@ -709,19 +725,27 @@ def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>;
def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>;
def X86mcvttp2ui : SDNode<"X86ISD::MCVTTP2UI", SDTMFloatToInt>;
+def SDTcvtph2ps : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]>;
+def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTcvtph2ps>;
+def X86strict_cvtph2ps : SDNode<"X86ISD::STRICT_CVTPH2PS", SDTcvtph2ps,
+ [SDNPHasChain]>;
+def X86any_cvtph2ps : PatFrags<(ops node:$src),
+ [(X86strict_cvtph2ps node:$src),
+ (X86cvtph2ps node:$src)]>;
+
+def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", SDTcvtph2ps>;
+
+def SDTcvtps2ph : SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>]>;
+def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTcvtps2ph>;
+def X86strict_cvtps2ph : SDNode<"X86ISD::STRICT_CVTPS2PH", SDTcvtps2ph,
+ [SDNPHasChain]>;
+def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2),
+ [(X86strict_cvtps2ph node:$src1, node:$src2),
+ (X86cvtps2ph node:$src1, node:$src2)]>;
-def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>]> >;
-
-def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>]> >;
-
-def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
- SDTCVecEltisVT<1, f32>,
- SDTCisVT<2, i32>]> >;
def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
SDTCVecEltisVT<1, f32>,
@@ -741,7 +765,9 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
// cvt fp to bfloat16
def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0,1>,
SDTCisSameAs<1,2>]>>;
def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
@@ -768,23 +794,6 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
]>;
//===----------------------------------------------------------------------===//
-// SSE Complex Patterns
-//===----------------------------------------------------------------------===//
-
-// These are 'extloads' from a scalar to the low element of a vector, zeroing
-// the top elements. These are used for the SSE 'ss' and 'sd' instruction
-// forms.
-def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot, SDNPWantParent]>;
-def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot, SDNPWantParent]>;
-
-def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
-def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
-
-//===----------------------------------------------------------------------===//
// SSE pattern fragments
//===----------------------------------------------------------------------===//
@@ -895,89 +904,6 @@ def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
-def X86masked_gather : SDNode<"X86ISD::MGATHER",
- SDTypeProfile<2, 3, [SDTCisVec<0>,
- SDTCisVec<1>, SDTCisInt<1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<1, 3>,
- SDTCisPtrTy<4>]>,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
-def X86masked_scatter : SDNode<"X86ISD::MSCATTER",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<0, 2>,
- SDTCVecEltisVT<0, i1>,
- SDTCisPtrTy<3>]>,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-
-def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v4i32;
-}]>;
-
-def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v8i32;
-}]>;
-
-def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v2i64;
-}]>;
-def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v4i64;
-}]>;
-def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v8i64;
-}]>;
-def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v16i32;
-}]>;
-
-def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v2i64;
-}]>;
-
-def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v4i32;
-}]>;
-
-def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v4i64;
-}]>;
-
-def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v8i32;
-}]>;
-
-def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v8i64;
-}]>;
-def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v16i32;
-}]>;
-
// 128-bit bitconvert pattern fragments
def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
@@ -1037,6 +963,23 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+// Scalar SSE intrinsic fragments to match several different types of loads.
+// Used by scalar SSE intrinsic instructions which have 128 bit types, but
+// only load a single element.
+// FIXME: We should add more canonicalizing in DAGCombine. Particularly removing
+// the simple_load case.
+def sse_load_f32 : PatFrags<(ops node:$ptr),
+ [(v4f32 (simple_load node:$ptr)),
+ (v4f32 (X86vzload32 node:$ptr)),
+ (v4f32 (scalar_to_vector (loadf32 node:$ptr)))]>;
+def sse_load_f64 : PatFrags<(ops node:$ptr),
+ [(v2f64 (simple_load node:$ptr)),
+ (v2f64 (X86vzload64 node:$ptr)),
+ (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>;
+
+def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
@@ -1185,60 +1128,60 @@ def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
index 90484241c28c..42c111173570 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -88,7 +88,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: break;
@@ -135,13 +135,497 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
return false;
}
+bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the instruction is not data invariant.
+ return false;
+
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+ // On x86 it is believed that imul is constant time w.r.t. its register
+ // inputs. However, these instructions set flags and are perhaps the most
+ // surprisingly constant-time operations, so we call them out here separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+ // Bit scanning and counting instructions are somewhat surprising: they scan
+ // across bits and do other fairly complex operations like popcnt, yet are
+ // believed to be constant time on x86. However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Shift and rotate.
+ case X86::ROL8r1:
+ case X86::ROL16r1:
+ case X86::ROL32r1:
+ case X86::ROL64r1:
+ case X86::ROL8rCL:
+ case X86::ROL16rCL:
+ case X86::ROL32rCL:
+ case X86::ROL64rCL:
+ case X86::ROL8ri:
+ case X86::ROL16ri:
+ case X86::ROL32ri:
+ case X86::ROL64ri:
+ case X86::ROR8r1:
+ case X86::ROR16r1:
+ case X86::ROR32r1:
+ case X86::ROR64r1:
+ case X86::ROR8rCL:
+ case X86::ROR16rCL:
+ case X86::ROR32rCL:
+ case X86::ROR64rCL:
+ case X86::ROR8ri:
+ case X86::ROR16ri:
+ case X86::ROR32ri:
+ case X86::ROR64ri:
+ case X86::SAR8r1:
+ case X86::SAR16r1:
+ case X86::SAR32r1:
+ case X86::SAR64r1:
+ case X86::SAR8rCL:
+ case X86::SAR16rCL:
+ case X86::SAR32rCL:
+ case X86::SAR64rCL:
+ case X86::SAR8ri:
+ case X86::SAR16ri:
+ case X86::SAR32ri:
+ case X86::SAR64ri:
+ case X86::SHL8r1:
+ case X86::SHL16r1:
+ case X86::SHL32r1:
+ case X86::SHL64r1:
+ case X86::SHL8rCL:
+ case X86::SHL16rCL:
+ case X86::SHL32rCL:
+ case X86::SHL64rCL:
+ case X86::SHL8ri:
+ case X86::SHL16ri:
+ case X86::SHL32ri:
+ case X86::SHL64ri:
+ case X86::SHR8r1:
+ case X86::SHR16r1:
+ case X86::SHR32r1:
+ case X86::SHR64r1:
+ case X86::SHR8rCL:
+ case X86::SHR16rCL:
+ case X86::SHR32rCL:
+ case X86::SHR64rCL:
+ case X86::SHR8ri:
+ case X86::SHR16ri:
+ case X86::SHR32ri:
+ case X86::SHR64ri:
+ case X86::SHLD16rrCL:
+ case X86::SHLD32rrCL:
+ case X86::SHLD64rrCL:
+ case X86::SHLD16rri8:
+ case X86::SHLD32rri8:
+ case X86::SHLD64rri8:
+ case X86::SHRD16rrCL:
+ case X86::SHRD32rrCL:
+ case X86::SHRD64rrCL:
+ case X86::SHRD16rri8:
+ case X86::SHRD32rri8:
+ case X86::SHRD64rri8:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr:
+ case X86::ADC8ri:
+ case X86::ADC16rr:
+ case X86::ADC16ri:
+ case X86::ADC16ri8:
+ case X86::ADC32rr:
+ case X86::ADC32ri:
+ case X86::ADC32ri8:
+ case X86::ADC64rr:
+ case X86::ADC64ri8:
+ case X86::ADC64ri32:
+ case X86::ADD8rr:
+ case X86::ADD8ri:
+ case X86::ADD16rr:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD32rr:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD64rr:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32:
+ case X86::AND8rr:
+ case X86::AND8ri:
+ case X86::AND16rr:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND32rr:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND64rr:
+ case X86::AND64ri8:
+ case X86::AND64ri32:
+ case X86::OR8rr:
+ case X86::OR8ri:
+ case X86::OR16rr:
+ case X86::OR16ri:
+ case X86::OR16ri8:
+ case X86::OR32rr:
+ case X86::OR32ri:
+ case X86::OR32ri8:
+ case X86::OR64rr:
+ case X86::OR64ri8:
+ case X86::OR64ri32:
+ case X86::SBB8rr:
+ case X86::SBB8ri:
+ case X86::SBB16rr:
+ case X86::SBB16ri:
+ case X86::SBB16ri8:
+ case X86::SBB32rr:
+ case X86::SBB32ri:
+ case X86::SBB32ri8:
+ case X86::SBB64rr:
+ case X86::SBB64ri8:
+ case X86::SBB64ri32:
+ case X86::SUB8rr:
+ case X86::SUB8ri:
+ case X86::SUB16rr:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB32rr:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB64rr:
+ case X86::SUB64ri8:
+ case X86::SUB64ri32:
+ case X86::XOR8rr:
+ case X86::XOR8ri:
+ case X86::XOR16rr:
+ case X86::XOR16ri:
+ case X86::XOR16ri8:
+ case X86::XOR32rr:
+ case X86::XOR32ri:
+ case X86::XOR32ri8:
+ case X86::XOR64rr:
+ case X86::XOR64ri8:
+ case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr:
+ case X86::ADCX64rr:
+ case X86::ADOX32rr:
+ case X86::ADOX64rr:
+ case X86::ANDN32rr:
+ case X86::ANDN64rr:
+ // Unary arithmetic operations.
+ case X86::DEC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::INC8r:
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+
+ // Unlike other arithmetic, NOT doesn't set EFLAGS.
+ case X86::NOT8r:
+ case X86::NOT16r:
+ case X86::NOT32r:
+ case X86::NOT64r:
+
+ // Various move instructions used to zero- or sign-extend values. Note that we
+ // intentionally don't support the _NOREX variants as we can't handle that
+ // register constraint anyway.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr8:
+ case X86::MOVSX64rr16:
+ case X86::MOVSX64rr32:
+ case X86::MOVZX16rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVZX32rr16:
+ case X86::MOVZX64rr8:
+ case X86::MOVZX64rr16:
+ case X86::MOV32rr:
+
+ // Arithmetic instructions that are constant time and do not set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return true;
+ }
+}
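As a rough usage sketch (not part of the patch itself): a mitigation pass such as speculative load hardening can consult this predicate to decide which instructions need no extra protection. The helper name below, needsHardening, is invented for illustration.

#include "X86InstrInfo.h"

using namespace llvm;

// Data-invariant instructions cannot leak their register inputs through
// execution time, so a hardening pass may leave them untouched.
static bool needsHardening(MachineInstr &MI) {
  return !X86InstrInfo::isDataInvariant(MI);
}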
+
+bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the load will immediately leak.
+ return false;
+
+ // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+ // However, these instructions set flags and are perhaps the most surprisingly
+ // constant-time operations, so we call them out here separately.
+ case X86::IMUL16rm:
+ case X86::IMUL16rmi8:
+ case X86::IMUL16rmi:
+ case X86::IMUL32rm:
+ case X86::IMUL32rmi8:
+ case X86::IMUL32rmi:
+ case X86::IMUL64rm:
+ case X86::IMUL64rmi32:
+ case X86::IMUL64rmi8:
+
+ // Bit scanning and counting instructions are somewhat surprising: they scan
+ // across bits and do other fairly complex operations like popcnt, yet are
+ // believed to be constant time on x86. However, these set flags.
+ case X86::BSF16rm:
+ case X86::BSF32rm:
+ case X86::BSF64rm:
+ case X86::BSR16rm:
+ case X86::BSR32rm:
+ case X86::BSR64rm:
+ case X86::LZCNT16rm:
+ case X86::LZCNT32rm:
+ case X86::LZCNT64rm:
+ case X86::POPCNT16rm:
+ case X86::POPCNT32rm:
+ case X86::POPCNT64rm:
+ case X86::TZCNT16rm:
+ case X86::TZCNT32rm:
+ case X86::TZCNT64rm:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rm:
+ case X86::BLCFILL64rm:
+ case X86::BLCI32rm:
+ case X86::BLCI64rm:
+ case X86::BLCIC32rm:
+ case X86::BLCIC64rm:
+ case X86::BLCMSK32rm:
+ case X86::BLCMSK64rm:
+ case X86::BLCS32rm:
+ case X86::BLCS64rm:
+ case X86::BLSFILL32rm:
+ case X86::BLSFILL64rm:
+ case X86::BLSI32rm:
+ case X86::BLSI64rm:
+ case X86::BLSIC32rm:
+ case X86::BLSIC64rm:
+ case X86::BLSMSK32rm:
+ case X86::BLSMSK64rm:
+ case X86::BLSR32rm:
+ case X86::BLSR64rm:
+ case X86::TZMSK32rm:
+ case X86::TZMSK64rm:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rm:
+ case X86::BEXTR64rm:
+ case X86::BEXTRI32mi:
+ case X86::BEXTRI64mi:
+ case X86::BZHI32rm:
+ case X86::BZHI64rm:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rm:
+ case X86::ADC16rm:
+ case X86::ADC32rm:
+ case X86::ADC64rm:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ case X86::ADD8rm:
+ case X86::ADD16rm:
+ case X86::ADD32rm:
+ case X86::ADD64rm:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ case X86::AND8rm:
+ case X86::AND16rm:
+ case X86::AND32rm:
+ case X86::AND64rm:
+ case X86::ANDN32rm:
+ case X86::ANDN64rm:
+ case X86::OR8rm:
+ case X86::OR16rm:
+ case X86::OR32rm:
+ case X86::OR64rm:
+ case X86::SBB8rm:
+ case X86::SBB16rm:
+ case X86::SBB32rm:
+ case X86::SBB64rm:
+ case X86::SUB8rm:
+ case X86::SUB16rm:
+ case X86::SUB32rm:
+ case X86::SUB64rm:
+ case X86::XOR8rm:
+ case X86::XOR16rm:
+ case X86::XOR32rm:
+ case X86::XOR64rm:
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as this is among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rm:
+ case X86::MULX64rm:
+
+ // Arithmetic instructions that are constant time and do not set flags.
+ case X86::RORX32mi:
+ case X86::RORX64mi:
+ case X86::SARX32rm:
+ case X86::SARX64rm:
+ case X86::SHLX32rm:
+ case X86::SHLX64rm:
+ case X86::SHRX32rm:
+ case X86::SHRX64rm:
+
+ // Conversions are believed to be constant time and don't set flags.
+ case X86::CVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64Zrm:
+ case X86::CVTTSD2SIrm:
+ case X86::VCVTTSD2SIrm:
+ case X86::VCVTTSD2SIZrm:
+ case X86::CVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64Zrm:
+ case X86::CVTTSS2SIrm:
+ case X86::VCVTTSS2SIrm:
+ case X86::VCVTTSS2SIZrm:
+ case X86::CVTSI2SDrm:
+ case X86::VCVTSI2SDrm:
+ case X86::VCVTSI2SDZrm:
+ case X86::CVTSI2SSrm:
+ case X86::VCVTSI2SSrm:
+ case X86::VCVTSI2SSZrm:
+ case X86::CVTSI642SDrm:
+ case X86::VCVTSI642SDrm:
+ case X86::VCVTSI642SDZrm:
+ case X86::CVTSI642SSrm:
+ case X86::VCVTSI642SSrm:
+ case X86::VCVTSI642SSZrm:
+ case X86::CVTSS2SDrm:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDZrm:
+ case X86::CVTSD2SSrm:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSZrm:
+ // AVX512 added unsigned integer conversions.
+ case X86::VCVTTSD2USI64Zrm:
+ case X86::VCVTTSD2USIZrm:
+ case X86::VCVTTSS2USI64Zrm:
+ case X86::VCVTTSS2USIZrm:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI642SSZrm:
+
+ // Loads to register don't set flags.
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSX16rm8:
+ case X86::MOVSX32rm16:
+ case X86::MOVSX32rm8:
+ case X86::MOVSX32rm8_NOREX:
+ case X86::MOVSX64rm16:
+ case X86::MOVSX64rm32:
+ case X86::MOVSX64rm8:
+ case X86::MOVZX16rm8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rm8_NOREX:
+ case X86::MOVZX64rm16:
+ case X86::MOVZX64rm8:
+ return true;
+ }
+}
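A minimal sketch of how the two predicates compose, assuming an invented helper name countLeakyLoads: a pass could scan a block for loads whose loaded value is not data-invariant and therefore must be hardened before it is used.

#include "X86InstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

using namespace llvm;

// Count loads whose loaded value may leak if consumed under misspeculation,
// i.e. loads that are not data-invariant with respect to the loaded data.
static unsigned countLeakyLoads(MachineBasicBlock &MBB) {
  unsigned Count = 0;
  for (MachineInstr &MI : MBB)
    if (MI.mayLoad() && !X86InstrInfo::isDataInvariantLoad(MI))
      ++Count;
  return Count;
}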
+
int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
if (isFrameInstr(MI)) {
- unsigned StackAlign = TFI->getStackAlignment();
- int SPAdj = alignTo(getFrameSize(MI), StackAlign);
+ int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
SPAdj -= getFrameAdjustment(MI);
if (!isFrameSetup(MI))
SPAdj = -SPAdj;
@@ -639,7 +1123,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
@@ -1182,61 +1666,61 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
- case X86::VBROADCASTSDZ256mk:
- case X86::VBROADCASTSDZmk:
- case X86::VBROADCASTSSZ128mk:
- case X86::VBROADCASTSSZ256mk:
- case X86::VBROADCASTSSZmk:
- case X86::VPBROADCASTDZ128mk:
- case X86::VPBROADCASTDZ256mk:
- case X86::VPBROADCASTDZmk:
- case X86::VPBROADCASTQZ128mk:
- case X86::VPBROADCASTQZ256mk:
- case X86::VPBROADCASTQZmk: {
+ case X86::VBROADCASTSDZ256rmk:
+ case X86::VBROADCASTSDZrmk:
+ case X86::VBROADCASTSSZ128rmk:
+ case X86::VBROADCASTSSZ256rmk:
+ case X86::VBROADCASTSSZrmk:
+ case X86::VPBROADCASTDZ128rmk:
+ case X86::VPBROADCASTDZ256rmk:
+ case X86::VPBROADCASTDZrmk:
+ case X86::VPBROADCASTQZ128rmk:
+ case X86::VPBROADCASTQZ256rmk:
+ case X86::VPBROADCASTQZrmk: {
unsigned Opc;
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
- case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
- case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
- case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
- case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
- case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
- case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
- case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break;
- case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break;
- case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break;
- case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break;
- case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break;
- case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break;
- case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break;
- case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break;
- case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break;
- case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break;
- case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break;
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
+ case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
+ case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
+ case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
+ case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
+ case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
+ case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
+ case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
+ case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
+ case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
+ case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
}
NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1883,7 +2367,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned KMaskOp = -1U;
if (X86II::isKMasked(TSFlags)) {
// For k-zero-masked operations it is Ok to commute the first vector
- // operand.
+ // operand. Unless this is an intrinsic instruction.
// For regular k-masked operations a conservative choice is done as the
// elements of the first vector operand, for which the corresponding bit
// in the k-mask operand is set to 0, are copied to the result of the
@@ -1902,7 +2386,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// The operand with index = 1 is used as a source for those elements for
// which the corresponding bit in the k-mask is set to 0.
- if (X86II::isKMergeMasked(TSFlags))
+ if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
@@ -2379,17 +2863,6 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) {
return Imm;
}
-bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
- if (!MI.isTerminator()) return false;
-
- // Conditional branch is a special case.
- if (MI.isBranch() && !MI.isBarrier())
- return true;
- if (!MI.isPredicable())
- return true;
- return !isPredicated(MI);
-}
-
bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case X86::TCRETURNdi:
@@ -2826,11 +3299,11 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
return Count;
}
-bool X86InstrInfo::
-canInsertSelect(const MachineBasicBlock &MBB,
- ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
if (!Subtarget.hasCMov())
return false;
@@ -2865,9 +3338,9 @@ canInsertSelect(const MachineBasicBlock &MBB,
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const {
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
@@ -3189,8 +3662,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
}
-bool X86InstrInfo::getMemOperandWithOffset(
- const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset,
+bool X86InstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
const MCInstrDesc &Desc = MemOp.getDesc();
int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
@@ -3199,7 +3673,8 @@ bool X86InstrInfo::getMemOperandWithOffset(
MemRefBegin += X86II::getOperandBias(Desc);
- BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+ const MachineOperand *BaseOp =
+ &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
if (!BaseOp->isReg()) // Can be an MO_FrameIndex
return false;
@@ -3221,6 +3696,13 @@ bool X86InstrInfo::getMemOperandWithOffset(
if (!BaseOp->isReg())
return false;
+ OffsetIsScalable = false;
+ // FIXME: Relying on memoperands() may not be the right thing to do here.
+ // Check with the X86 maintainers and fix it accordingly. For now it is OK,
+ // since `Width` is not used by the X86 back-end at the moment.
+ Width =
+ !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
+ BaseOps.push_back(BaseOp);
return true;
}
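A hedged usage sketch of the widened hook, with invented helper and variable names: callers such as load/store clustering pass a vector for the base operands and receive the offset, a scalable-offset flag, and the access width.

#include "X86InstrInfo.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

// Fetch the base register and constant offset of a memory access, the way a
// clustering heuristic might. Returns false when X86 cannot describe MI this
// way (e.g. when the base is a frame index rather than a register).
static bool getBaseAndOffset(const X86InstrInfo &TII,
                             const TargetRegisterInfo *TRI,
                             const MachineInstr &MI,
                             Register &Base, int64_t &Offset) {
  SmallVector<const MachineOperand *, 1> BaseOps;
  bool OffsetIsScalable = false;
  unsigned Width = 0;
  if (!TII.getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
                                         Width, TRI))
    return false;
  // X86 reports exactly one base operand and never a scalable offset.
  Base = BaseOps.front()->getReg();
  return true;
}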
@@ -3241,7 +3723,7 @@ static unsigned getLoadRegOpcode(unsigned DestReg,
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIdx,
+ Register SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
@@ -3249,7 +3731,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
@@ -3258,20 +3740,20 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIdx,
+ Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
}
-bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
@@ -3358,7 +3840,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
/// SrcReg, SrcReg2: register operands for FlagI.
/// ImmValue: immediate for FlagI if it takes an immediate.
inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
- unsigned SrcReg, unsigned SrcReg2,
+ Register SrcReg, Register SrcReg2,
int ImmMask, int ImmValue,
const MachineInstr &OI) {
if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
@@ -3547,8 +4029,8 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
/// Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
-bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask,
+bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask,
int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
@@ -3875,15 +4357,15 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
// implicit operands.
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
// But we don't trust that.
- assert(MIB->getOperand(1).getReg() == Reg &&
- MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+ assert(MIB.getReg(1) == Reg &&
+ MIB.getReg(2) == Reg && "Misplaced operand");
return true;
}
@@ -3905,7 +4387,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
// Insert the XOR.
BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
@@ -3949,7 +4431,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP64r));
MIB->getOperand(0)
- .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
+ .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
} else {
assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
StackAdjustment = 4;
@@ -3981,14 +4463,14 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
auto Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
@@ -4019,7 +4501,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
const MCInstrDesc &LoadDesc,
const MCInstrDesc &BroadcastDesc,
unsigned SubIdx) {
- Register DestReg = MIB->getOperand(0).getReg();
+ Register DestReg = MIB.getReg(0);
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(DestReg) < 16) {
// We can use a normal VEX encoded load.
@@ -4042,7 +4524,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
const MCInstrDesc &StoreDesc,
const MCInstrDesc &ExtractDesc,
unsigned SubIdx) {
- Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+ Register SrcReg = MIB.getReg(X86::AddrNumOperands);
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(SrcReg) < 16) {
// We can use a normal VEX encoded store.
@@ -4065,7 +4547,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
// Temporarily remove the immediate so we can add another source register.
MIB->RemoveOperand(2);
// Add the register. Don't copy the kill flag if there is one.
- MIB.addReg(MIB->getOperand(1).getReg(),
+ MIB.addReg(MIB.getReg(1),
getUndefRegState(MIB->getOperand(1).isUndef()));
// Add back the immediate.
MIB.addImm(ShiftAmt);
@@ -4085,10 +4567,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
return ExpandMOVImmSExti8(MIB, *this, Subtarget);
- case X86::SETB_C8r:
- return Expand2AddrUndef(MIB, get(X86::SBB8rr));
- case X86::SETB_C16r:
- return Expand2AddrUndef(MIB, get(X86::SBB16rr));
case X86::SETB_C32r:
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
@@ -4103,7 +4581,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX_SET0: {
assert(HasAVX && "AVX not supported");
const TargetRegisterInfo *TRI = &getRegisterInfo();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB, get(X86::VXORPSrr));
@@ -4115,7 +4593,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0F128: {
bool HasVLX = Subtarget.hasVLX();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
return Expand2AddrUndef(MIB,
@@ -4129,7 +4607,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0: {
bool HasVLX = Subtarget.hasVLX();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
@@ -4152,14 +4630,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX1_SETALLONES: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
// VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
MIB->setDesc(get(X86::VCMPPSYrri));
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
case X86::AVX512_512_SETALLONES: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
@@ -4169,8 +4647,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_512_SEXT_MASK_32:
case X86::AVX512_512_SEXT_MASK_64: {
- Register Reg = MIB->getOperand(0).getReg();
- Register MaskReg = MIB->getOperand(1).getReg();
+ Register Reg = MIB.getReg(0);
+ Register MaskReg = MIB.getReg(1);
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
@@ -4207,7 +4685,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::MOV32ri64: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
MI.setDesc(get(X86::MOV32ri));
MIB->getOperand(0).setReg(Reg32);
@@ -4360,11 +4838,105 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction that copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
+// Also returns true for instructions that have two inputs where one may
+// be undef and we want it to use the same register as the other input.
+static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
bool ForLoadFold = false) {
// Set the OpNum parameter to the first source operand.
- OpNum = 1;
switch (Opcode) {
+ case X86::MMX_PUNPCKHBWirr:
+ case X86::MMX_PUNPCKHWDirr:
+ case X86::MMX_PUNPCKHDQirr:
+ case X86::MMX_PUNPCKLBWirr:
+ case X86::MMX_PUNPCKLWDirr:
+ case X86::MMX_PUNPCKLDQirr:
+ case X86::MOVHLPSrr:
+ case X86::PACKSSWBrr:
+ case X86::PACKUSWBrr:
+ case X86::PACKSSDWrr:
+ case X86::PACKUSDWrr:
+ case X86::PUNPCKHBWrr:
+ case X86::PUNPCKLBWrr:
+ case X86::PUNPCKHWDrr:
+ case X86::PUNPCKLWDrr:
+ case X86::PUNPCKHDQrr:
+ case X86::PUNPCKLDQrr:
+ case X86::PUNPCKHQDQrr:
+ case X86::PUNPCKLQDQrr:
+ case X86::SHUFPDrri:
+ case X86::SHUFPSrri:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the first source to avoid a false dependency.
+ // Operand 1 of these instructions is tied so they're separate from their
+ // VEX counterparts.
+ return OpNum == 2 && !ForLoadFold;
+
+ case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
+ case X86::VPACKSSWBrr:
+ case X86::VPACKUSWBrr:
+ case X86::VPACKSSDWrr:
+ case X86::VPACKUSDWrr:
+ case X86::VPACKSSWBZ128rr:
+ case X86::VPACKUSWBZ128rr:
+ case X86::VPACKSSDWZ128rr:
+ case X86::VPACKUSDWZ128rr:
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF32X4Zrri:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFF64X2Zrri:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI32X4Zrri:
+ case X86::VSHUFI64X2Z256rri:
+ case X86::VSHUFI64X2Zrri:
+ case X86::VPUNPCKHBWrr:
+ case X86::VPUNPCKLBWrr:
+ case X86::VPUNPCKHBWYrr:
+ case X86::VPUNPCKLBWYrr:
+ case X86::VPUNPCKHBWZ128rr:
+ case X86::VPUNPCKLBWZ128rr:
+ case X86::VPUNPCKHBWZ256rr:
+ case X86::VPUNPCKLBWZ256rr:
+ case X86::VPUNPCKHBWZrr:
+ case X86::VPUNPCKLBWZrr:
+ case X86::VPUNPCKHWDrr:
+ case X86::VPUNPCKLWDrr:
+ case X86::VPUNPCKHWDYrr:
+ case X86::VPUNPCKLWDYrr:
+ case X86::VPUNPCKHWDZ128rr:
+ case X86::VPUNPCKLWDZ128rr:
+ case X86::VPUNPCKHWDZ256rr:
+ case X86::VPUNPCKLWDZ256rr:
+ case X86::VPUNPCKHWDZrr:
+ case X86::VPUNPCKLWDZrr:
+ case X86::VPUNPCKHDQrr:
+ case X86::VPUNPCKLDQrr:
+ case X86::VPUNPCKHDQYrr:
+ case X86::VPUNPCKLDQYrr:
+ case X86::VPUNPCKHDQZ128rr:
+ case X86::VPUNPCKLDQZ128rr:
+ case X86::VPUNPCKHDQZ256rr:
+ case X86::VPUNPCKLDQZ256rr:
+ case X86::VPUNPCKHDQZrr:
+ case X86::VPUNPCKLDQZrr:
+ case X86::VPUNPCKHQDQrr:
+ case X86::VPUNPCKLQDQrr:
+ case X86::VPUNPCKHQDQYrr:
+ case X86::VPUNPCKLQDQYrr:
+ case X86::VPUNPCKHQDQZ128rr:
+ case X86::VPUNPCKLQDQZ128rr:
+ case X86::VPUNPCKHQDQZ256rr:
+ case X86::VPUNPCKLQDQZ256rr:
+ case X86::VPUNPCKHQDQZrr:
+ case X86::VPUNPCKLQDQZrr:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the first source to avoid a false dependency.
+ return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
+
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
case X86::VCVTSI2SSrr_Int:
@@ -4422,7 +4994,7 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VCVTUSI642SDZrm_Int:
// Load folding won't affect the undef register update since the input is
// a GPR.
- return !ForLoadFold;
+ return OpNum == 1 && !ForLoadFold;
case X86::VCVTSD2SSrr:
case X86::VCVTSD2SSrm:
case X86::VCVTSD2SSrr_Int:
@@ -4521,15 +5093,13 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
- return true;
+ return OpNum == 1;
case X86::VMOVSSZrrk:
case X86::VMOVSDZrrk:
- OpNum = 3;
- return true;
+ return OpNum == 3 && !ForLoadFold;
case X86::VMOVSSZrrkz:
case X86::VMOVSDZrrkz:
- OpNum = 2;
- return true;
+ return OpNum == 2 && !ForLoadFold;
}
return false;
@@ -4552,13 +5122,17 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
- return 0;
-
- const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
- return UndefRegClearance;
+ for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isUndef() &&
+ Register::isPhysicalRegister(MO.getReg()) &&
+ hasUndefRegUpdate(MI.getOpcode(), i)) {
+ OpNum = i;
+ return UndefRegClearance;
+ }
}
+
return 0;
}
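A minimal sketch, assuming an invented helper name, of how a false-dependency breaker might consume this hook: a non-zero clearance identifies the operand whose undef read should be redirected to another register.

#include "X86InstrInfo.h"

using namespace llvm;

// A non-zero clearance means operand OpNum of MI reads an undef physical
// register and is worth redirecting to another register to avoid a false
// dependency on the register's previous value.
static bool hasBreakableUndefRead(const X86InstrInfo &TII,
                                  const TargetRegisterInfo *TRI,
                                  MachineInstr &MI, unsigned &OpNum) {
  return TII.getUndefRegClearance(MI, OpNum, TRI) != 0;
}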
@@ -4729,7 +5303,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align) const {
+ unsigned Size, Align Alignment) const {
switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
@@ -4745,7 +5319,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
@@ -4769,7 +5343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
(MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
@@ -4788,7 +5362,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
MachineInstr *NewMI =
FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
return NewMI;
@@ -4802,8 +5376,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
- unsigned Ignored;
- if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
+ if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
@@ -4820,11 +5393,10 @@ static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
return VRegDef && VRegDef->isImplicitDef();
}
-
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align, bool AllowCommute) const {
+ unsigned Size, Align Alignment, bool AllowCommute) const {
bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
bool isTwoAddrFold = false;
@@ -4864,8 +5436,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineInstr *NewMI = nullptr;
// Attempt to fold any custom cases we have.
- if (MachineInstr *CustomMI =
- foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ if (MachineInstr *CustomMI = foldMemoryOperandCustom(
+ MF, MI, OpNum, MOs, InsertPt, Size, Alignment))
return CustomMI;
const X86MemoryFoldTableEntry *I = nullptr;
@@ -4892,9 +5464,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
- unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
- MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0;
- if (Align < MinAlign)
+ MaybeAlign MinAlign =
+ decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
+ if (MinAlign && Alignment < *MinAlign)
return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
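For reference, the fold-table alignment gate above can be restated as a small helper; meetsFoldAlignment is an invented name, and the encoding (log2 of the alignment plus one, zero meaning no requirement) follows from the expression this patch replaces.

#include "llvm/Support/Alignment.h"

using namespace llvm;

// The fold table encodes a minimum alignment as log2(alignment) + 1, with 0
// meaning "no requirement"; decodeMaybeAlign performs the same decoding as
// the old `Encoded ? 1 << (Encoded - 1) : 0` expression.
static bool meetsFoldAlignment(unsigned EncodedMinAlign, Align Actual) {
  MaybeAlign Min = decodeMaybeAlign(EncodedMinAlign);
  return !Min || Actual >= *Min; // Fold only when the operand is aligned enough.
}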
@@ -4969,8 +5541,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
}
// Attempt to fold with the commuted version of the instruction.
- NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
- Size, Align, /*AllowCommute=*/false);
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
+ Alignment, /*AllowCommute=*/false);
if (NewMI)
return NewMI;
@@ -5024,12 +5596,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
- unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
+ Align Alignment = MFI.getObjectAlign(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
Alignment =
- std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
+ std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
@@ -5087,12 +5659,31 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// destination register is wider than 32 bits (4 bytes), and its user
// instruction isn't scalar (SS).
switch (UserOpc) {
+ case X86::CVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrr_Intk:
+ case X86::VCVTSS2SDZrr_Intkz:
+ case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIrr_Int: case X86::VCVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIZrr_Int: case X86::VCVTSS2SI64Zrr_Int:
+ case X86::CVTTSS2SIrr_Int: case X86::CVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIrr_Int: case X86::VCVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIZrr_Int: case X86::VCVTTSS2SI64Zrr_Int:
+ case X86::VCVTSS2USIZrr_Int: case X86::VCVTSS2USI64Zrr_Int:
+ case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int:
+ case X86::RCPSSr_Int: case X86::VRCPSSr_Int:
+ case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int:
+ case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int:
+ case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int:
+ case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int:
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+ case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
case X86::VCMPSSZrr_Intk:
@@ -5100,6 +5691,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+ case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz:
case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
@@ -5127,6 +5719,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
+ case X86::VFIXUPIMMSSZrri:
+ case X86::VFIXUPIMMSSZrrik:
+ case X86::VFIXUPIMMSSZrrikz:
+ case X86::VFPCLASSSSZrr:
+ case X86::VFPCLASSSSZrrk:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrk:
+ case X86::VGETEXPSSZrkz:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrik:
+ case X86::VGETMANTSSZrrikz:
+ case X86::VRANGESSZrri:
+ case X86::VRANGESSZrrik:
+ case X86::VRANGESSZrrikz:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrrk:
+ case X86::VRCP14SSZrrkz:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrk:
+ case X86::VRCP28SSZrkz:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrik:
+ case X86::VREDUCESSZrrikz:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZr_Intk:
+ case X86::VRNDSCALESSZr_Intkz:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrrk:
+ case X86::VRSQRT14SSZrrkz:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrk:
+ case X86::VRSQRT28SSZrkz:
+ case X86::VSCALEFSSZrr:
+ case X86::VSCALEFSSZrrk:
+ case X86::VSCALEFSSZrrkz:
return false;
default:
return true;
@@ -5141,12 +5768,29 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// destination register is wider than 64 bits (8 bytes), and its user
// instruction isn't scalar (SD).
switch (UserOpc) {
+ case X86::CVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrr_Intk:
+ case X86::VCVTSD2SSZrr_Intkz:
+ case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIrr_Int: case X86::VCVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIZrr_Int: case X86::VCVTSD2SI64Zrr_Int:
+ case X86::CVTTSD2SIrr_Int: case X86::CVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIrr_Int: case X86::VCVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIZrr_Int: case X86::VCVTTSD2SI64Zrr_Int:
+ case X86::VCVTSD2USIZrr_Int: case X86::VCVTSD2USI64Zrr_Int:
+ case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int:
+ case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int:
+ case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int:
+ case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int:
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+ case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
case X86::VCMPSDZrr_Intk:
@@ -5154,6 +5798,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+ case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz:
case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
@@ -5181,6 +5826,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
+ case X86::VFIXUPIMMSDZrri:
+ case X86::VFIXUPIMMSDZrrik:
+ case X86::VFIXUPIMMSDZrrikz:
+ case X86::VFPCLASSSDZrr:
+ case X86::VFPCLASSSDZrrk:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrk:
+ case X86::VGETEXPSDZrkz:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrik:
+ case X86::VGETMANTSDZrrikz:
+ case X86::VRANGESDZrri:
+ case X86::VRANGESDZrrik:
+ case X86::VRANGESDZrrikz:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrrk:
+ case X86::VRCP14SDZrrkz:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrk:
+ case X86::VRCP28SDZrkz:
+ case X86::VREDUCESDZrri:
+ case X86::VREDUCESDZrrik:
+ case X86::VREDUCESDZrrikz:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZr_Intk:
+ case X86::VRNDSCALESDZr_Intkz:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrrk:
+ case X86::VRSQRT14SDZrrkz:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrk:
+ case X86::VRSQRT28SDZrkz:
+ case X86::VSCALEFSDZrr:
+ case X86::VSCALEFSDZrrk:
+ case X86::VSCALEFSDZrrkz:
return false;
default:
return true;
@@ -5221,36 +5901,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
// Determine the alignment of the load.
- unsigned Alignment = 0;
+ Align Alignment;
if (LoadMI.hasOneMemOperand())
- Alignment = (*LoadMI.memoperands_begin())->getAlignment();
+ Alignment = (*LoadMI.memoperands_begin())->getAlign();
else
switch (LoadMI.getOpcode()) {
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
- Alignment = 64;
+ Alignment = Align(64);
break;
case X86::AVX2_SETALLONES:
case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
- Alignment = 32;
+ Alignment = Align(32);
break;
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX512_128_SET0:
case X86::FsFLD0F128:
case X86::AVX512_FsFLD0F128:
- Alignment = 16;
+ Alignment = Align(16);
break;
case X86::MMX_SET0:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
- Alignment = 8;
+ Alignment = Align(8);
break;
case X86::FsFLD0SS:
case X86::AVX512_FsFLD0SS:
- Alignment = 4;
+ Alignment = Align(4);
break;
default:
return nullptr;
@@ -5325,14 +6005,18 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
Ty = Type::getFP128Ty(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 8);
else if (Opc == X86::MMX_SET0)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 2);
else
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
Opc == X86::AVX512_512_SETALLONES ||
@@ -5418,33 +6102,33 @@ static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
case TB_BCAST_D:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VPBROADCASTDZ128m;
- case 32: return X86::VPBROADCASTDZ256m;
- case 64: return X86::VPBROADCASTDZm;
+ case 16: return X86::VPBROADCASTDZ128rm;
+ case 32: return X86::VPBROADCASTDZ256rm;
+ case 64: return X86::VPBROADCASTDZrm;
}
break;
case TB_BCAST_Q:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VPBROADCASTQZ128m;
- case 32: return X86::VPBROADCASTQZ256m;
- case 64: return X86::VPBROADCASTQZm;
+ case 16: return X86::VPBROADCASTQZ128rm;
+ case 32: return X86::VPBROADCASTQZ256rm;
+ case 64: return X86::VPBROADCASTQZrm;
}
break;
case TB_BCAST_SS:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VBROADCASTSSZ128m;
- case 32: return X86::VBROADCASTSSZ256m;
- case 64: return X86::VBROADCASTSSZm;
+ case 16: return X86::VBROADCASTSSZ128rm;
+ case 32: return X86::VBROADCASTSSZ256rm;
+ case 64: return X86::VBROADCASTSSZrm;
}
break;
case TB_BCAST_SD:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
case 16: return X86::VMOVDDUPZ128rm;
- case 32: return X86::VBROADCASTSDZ256m;
- case 64: return X86::VBROADCASTSDZm;
+ case 32: return X86::VBROADCASTSDZ256rm;
+ case 64: return X86::VBROADCASTSDZrm;
}
break;
}
@@ -5504,7 +6188,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
}
@@ -5581,7 +6265,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -5648,7 +6332,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
}
@@ -5714,7 +6398,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
SDNode *Store =
DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
dl, MVT::Other, AddrOps);
@@ -6124,18 +6808,18 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm },
{ X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
{ X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
- { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
- { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
- { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
- { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r },
- { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
- { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
- { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
+ { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr },
+ { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm },
+ { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr },
+ { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrr, X86::VPBROADCASTDZrr },
+ { X86::VBROADCASTSSZrm, X86::VBROADCASTSSZrm, X86::VPBROADCASTDZrm },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128rr },
+ { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128rm },
+ { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr },
+ { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrr, X86::VPBROADCASTQZrr },
+ { X86::VBROADCASTSDZrm, X86::VBROADCASTSDZrm, X86::VPBROADCASTQZrm },
{ X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr },
{ X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm },
{ X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr },
@@ -6895,7 +7579,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
// Don't change integer Q instructions to D instructions and
- // use D intructions if we started with a PS instruction.
+ // use D instructions if we started with a PS instruction.
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
Domain = 4;
}
@@ -7552,7 +8236,8 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VMULSSrr:
case X86::VMULSDZrr:
case X86::VMULSSZrr:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
default:
return false;
}
@@ -7679,6 +8364,10 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
return ParamLoadedValue(*Op, Expr);;
}
+ case X86::MOV8ri:
+ case X86::MOV16ri:
+ // TODO: Handle MOV8ri and MOV16ri.
+ return None;
case X86::MOV32ri:
case X86::MOV64ri:
case X86::MOV64ri32:
@@ -7738,6 +8427,20 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const {
+ // Propagate FP flags from the original instructions.
+ // But clear poison-generating flags because those may not be valid now.
+ // TODO: There should be a helper function for copying only fast-math-flags.
+ uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+ NewMI1.setFlags(IntersectedFlags);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+ NewMI2.setFlags(IntersectedFlags);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+
// Integer instructions may define an implicit EFLAGS dest register operand.
MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
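A compact model of the flag handling introduced above, using simplified stand-in types rather than LLVM's MachineInstr API: the new instructions keep only the flags both originals agree on, then drop the poison-generating nsw/nuw/exact flags, which may no longer hold after reassociation.

// Simplified stand-ins, not LLVM's API: models "intersect the flags of the
// two original instructions, then clear poison-generating flags".
#include <cstdint>

enum Flag : uint16_t {
  FmReassoc = 1 << 0, FmNsz = 1 << 1,
  NoSWrap = 1 << 2, NoUWrap = 1 << 3, IsExact = 1 << 4
};

static uint16_t flagsForReassociatedCopy(uint16_t OldFlags1,
                                         uint16_t OldFlags2) {
  uint16_t Intersected = OldFlags1 & OldFlags2;    // keep common flags only
  Intersected &= ~(NoSWrap | NoUWrap | IsExact);   // drop poison flags
  return Intersected;
}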
@@ -7957,8 +8660,7 @@ namespace {
}
// Visit the children of this block in the dominator tree.
- for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
- I != E; ++I) {
+ for (auto I = Node->begin(), E = Node->end(); I != E; ++I) {
Changed |= VisitNode(*I, TLSBaseAddrReg);
}
@@ -8073,6 +8775,35 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
return Sum + 1;
});
+ // We check to see if CFI Instructions are present, and if they are
+ // we find the number of CFI Instructions in the candidates.
+ unsigned CFICount = 0;
+ MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
+ for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
+ Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
+ const std::vector<MCCFIInstruction> &CFIInstructions =
+ RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
+ if (MBBI->isCFIInstruction()) {
+ unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ CFICount++;
+ }
+ MBBI++;
+ }
+
+ // We compare the number of found CFI Instructions to the number of CFI
+ // instructions in the parent function for each candidate. We must check this
+ // since if we outline one of the CFI instructions in a function, we have to
+ // outline them all for correctness. If we do not, the address offsets will be
+ // incorrect between the two sections of the program.
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ std::vector<MCCFIInstruction> CFIInstructions =
+ C.getMF()->getFrameInstructions();
+
+ if (CFICount > 0 && CFICount != CFIInstructions.size())
+ return outliner::OutlinedFunction();
+ }
+
// FIXME: Use real size in bytes for call and ret instructions.
if (RepeatedSequenceLocs[0].back()->isTerminator()) {
for (outliner::Candidate &C : RepeatedSequenceLocs)
@@ -8084,6 +8815,9 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
);
}
+ if (CFICount > 0)
+ return outliner::OutlinedFunction();
+
for (outliner::Candidate &C : RepeatedSequenceLocs)
C.setCallInfo(MachineOutlinerDefault, 1);
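The CFI handling added above amounts to: count the CFI instructions inside the candidate, reject a candidate whose count does not match its parent function's total, and finally reject any candidate containing CFI instructions at all. A stand-alone sketch with simplified types, not the outliner's real data structures:

// Simplified sketch of the CFI guard above; 'Instr' stands in for a
// MachineInstr and the bool for isCFIInstruction().
#include <vector>

struct Instr { bool IsCFI; };

static bool mayOutline(const std::vector<Instr> &Candidate,
                       unsigned CFIInParentFunction) {
  unsigned CFICount = 0;
  for (const Instr &I : Candidate)
    if (I.IsCFI)
      ++CFICount;
  // Outlining only some of a function's CFI instructions would leave the
  // unwind offsets split across two code regions.
  if (CFICount > 0 && CFICount != CFIInParentFunction)
    return false;
  // The code above is currently even more conservative and rejects any
  // candidate that contains CFI instructions.
  return CFICount == 0;
}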
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
index 1d2da5305357..89f2ff118c37 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h
@@ -24,8 +24,6 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
-class MachineInstrBuilder;
-class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
@@ -180,8 +178,37 @@ public:
/// true, then it's expected the pre-extension value is available as a subreg
/// of the result register. This also returns the sub-register index in
/// SubIdx.
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
+ bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
+ Register &DstReg, unsigned &SubIdx) const override;
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value of any of its register operands
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariant(MachineInstr &MI);
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value loaded from memory or the value of any
+ /// non-address register operands.
+ ///
+ /// For example, if the latency of the instruction is dependent on the
+ /// particular bits set in any of the registers *or* any of the bits loaded
+ /// from memory.
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariantLoad(MachineInstr &MI);
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -208,7 +235,7 @@ public:
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
@@ -278,7 +305,6 @@ public:
const X86InstrFMA3Group &FMA3Group) const;
// Branch analysis.
- bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
bool isUnconditionalTailCall(const MachineInstr &MI) const override;
bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
const MachineInstr &TailCall) const override;
@@ -291,10 +317,11 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
TargetInstrInfo::MachineBranchPredicate &MBP,
bool AllowModify = false) const override;
@@ -306,22 +333,23 @@ public:
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int &, int &, int &) const override;
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -443,7 +471,7 @@ public:
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Alignment,
+ unsigned Size, Align Alignment,
bool AllowCommute) const;
bool isHighLatencyDef(int opc) const override;
@@ -469,15 +497,15 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
/// optimizeLoadInstr - Try to remove the load by folding it to a register
@@ -563,7 +591,7 @@ private:
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align) const;
+ unsigned Size, Align Alignment) const;
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
index 93f40c8ec996..23841c3d7e50 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.td
@@ -16,10 +16,10 @@
// X86 specific DAG Nodes.
//
-def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
-
-def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
+ SDTCisSameAs<1, 2>]>;
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
@@ -121,6 +121,8 @@ def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -138,12 +140,13 @@ def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
-def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
-def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
+def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;
def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
-def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>;
-def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>;
+def X86fcmp : SDNode<"X86ISD::FCMP", SDTX86FCmp>;
+def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86FCmp, [SDNPHasChain]>;
+def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86FCmp, [SDNPHasChain]>;
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
@@ -152,8 +155,6 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
-def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
-
def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
@@ -286,6 +287,9 @@ def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
+def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>;
+def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>;
+
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -294,6 +298,9 @@ def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
[SDNPHasChain]>;
+def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA,
+ [SDNPHasChain]>;
+
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
@@ -354,6 +361,8 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
+
+ def X86SibMemOperand : AsmOperandClass { let Name = "SibMem"; }
}
def X86AbsMemAsmOperand : AsmOperandClass {
@@ -376,14 +385,16 @@ class X86VMemOperand<RegisterClass RC, string printMethod,
let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
}
-def anymem : X86MemOperand<"printanymem">;
+def anymem : X86MemOperand<"printMemReference">;
def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fcmp node:$lhs, node:$rhs),
- (X86cmp node:$lhs, node:$rhs)]>;
+ (X86fcmp node:$lhs, node:$rhs)]>;
// FIXME: Right now we allow any size during parsing, but we might want to
// restrict to only unsized memory.
-def opaquemem : X86MemOperand<"printopaquemem">;
+def opaquemem : X86MemOperand<"printMemReference">;
+
+def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>;
def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
@@ -757,14 +768,14 @@ def i64u8imm : Operand<i64> {
}
def lea64_32mem : Operand<i32> {
- let PrintMethod = "printanymem";
+ let PrintMethod = "printMemReference";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
// Memory operands that use 64-bit pointers in both ILP32 and LP64.
def lea64mem : Operand<i64> {
- let PrintMethod = "printanymem";
+ let PrintMethod = "printMemReference";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
@@ -830,11 +841,10 @@ def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
-// A relocatable immediate is either an immediate operand or an operand that can
-// be relocated by the linker to an immediate, such as a regular symbol in
-// non-PIC code.
-def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
- 0>;
+// A relocatable immediate is an operand that can be relocated by the linker to
+// an immediate, such as a regular symbol in non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm",
+ [X86Wrapper], [], 0>;
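A hedged source-level illustration of what the reworded comment above calls a relocatable immediate (the symbol name is made up for the example): in non-PIC, small-code-model x86 code, the address of an external symbol can be encoded as an instruction immediate that the linker fixes up.

// Illustration only; 'external_table' is a hypothetical symbol. Built
// non-PIC for x86-64 with the small code model, taking its address is
// typically selected as a mov with a linker-relocated immediate.
#include <cstdint>

extern char external_table[];

std::uintptr_t tableAddress() {
  return reinterpret_cast<std::uintptr_t>(external_table);
}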
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
@@ -922,11 +932,10 @@ def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasSGX : Predicate<"Subtarget->hasSGX()">;
-def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
-def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
@@ -948,18 +957,23 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
+def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">;
+def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">;
+def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
+def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
+def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
- AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
+ AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
- AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+ AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">;
def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
def In16BitMode : Predicate<"Subtarget->is16Bit()">,
- AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+ AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">;
def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
- AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+ AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">;
def In32BitMode : Predicate<"Subtarget->is32Bit()">,
- AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+ AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
@@ -1033,13 +1047,17 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
-// FIXME: Ideally we would just replace the above i*immSExt* matchers with
-// relocImm-based matchers, but then FastISel would be unable to use them.
+def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
+def i32relocImmSExt8 : PatLeaf<(i32 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
- return isSExtRelocImm<8>(N);
+ return isSExtAbsoluteSymbolRef(8, N);
}]>;
def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
- return isSExtRelocImm<32>(N);
+ return isSExtAbsoluteSymbolRef(32, N);
}]>;
// If we have multiple users of an immediate, it's much smaller to reuse
@@ -1059,6 +1077,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
+def imm_su : PatLeaf<(imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
def relocImm8_su : PatLeaf<(i8 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1069,20 +1094,26 @@ def relocImm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+def i16relocImmSExt8_su : PatLeaf<(i16relocImmSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+def i32relocImmSExt8_su : PatLeaf<(i32relocImmSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1113,7 +1144,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
- if (ExtType == ISD::EXTLOAD)
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
return LD->getAlignment() >= 2 && LD->isSimple();
return false;
}]>;
@@ -1123,7 +1154,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
- if (ExtType == ISD::EXTLOAD)
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
return LD->getAlignment() >= 4 && LD->isSimple();
return false;
}]>;
@@ -1550,7 +1581,7 @@ def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
[(set GR16:$dst, imm:$src)]>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, relocImm:$src)]>, OpSize32;
+ [(set GR32:$dst, imm:$src)]>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)]>;
@@ -1558,7 +1589,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
let isReMaterializable = 1, isMoveImm = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, relocImm:$src)]>;
+ [(set GR64:$dst, imm:$src)]>;
}
// Longer forms that use a ModR/M byte. Needed for disassembler
@@ -1578,19 +1609,31 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 relocImm8_su:$src), addr:$dst)]>;
+ [(store (i8 imm_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16;
+ [(store (i16 imm_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32;
+ [(store (i32 imm_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64relocImmSExt32_su:$src, addr:$dst)]>,
+ [(store i64immSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
+def : Pat<(i32 relocImm:$src), (MOV32ri relocImm:$src)>;
+def : Pat<(i64 relocImm:$src), (MOV64ri relocImm:$src)>;
+
+def : Pat<(store (i8 relocImm8_su:$src), addr:$dst),
+ (MOV8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (i16 relocImm16_su:$src), addr:$dst),
+ (MOV16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (i32 relocImm32_su:$src), addr:$dst),
+ (MOV32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (i64 i64relocImmSExt32_su:$src), addr:$dst),
+ (MOV64mi32 addr:$dst, i64immSExt32_su:$src)>;
+
let hasSideEffects = 0 in {
/// Memory offset versions of moves. The immediate is an address mode sized
@@ -1787,9 +1830,8 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
// Condition code ops, incl. set if equal/not equal/...
let SchedRW = [WriteLAHFSAHF] in {
-let Defs = [EFLAGS], Uses = [AH] in
-def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
- [(set EFLAGS, (X86sahf AH))]>,
+let Defs = [EFLAGS], Uses = [AH], hasSideEffects = 0 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>, // flags = AH
Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
@@ -2163,24 +2205,24 @@ def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
// Lock instruction prefix
let SchedRW = [WriteMicrocoded] in
-def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+def LOCK_PREFIX : I<0xF0, PrefixByte, (outs), (ins), "lock", []>;
let SchedRW = [WriteNop] in {
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+def REX64_PREFIX : I<0x48, PrefixByte, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+def DATA16_PREFIX : I<0x66, PrefixByte, (outs), (ins), "data16", []>;
} // SchedRW
// Repeat string operation instruction prefixes
let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in {
// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
-def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+def REP_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "rep", []>;
// Repeat while not equal (used with CMPS and SCAS)
-def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+def REPNE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "repne", []>;
}
// String manipulation instructions
@@ -2581,27 +2623,27 @@ let Predicates = [HasBMI2, NoTBM] in {
}
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int,
+ X86MemOperand x86memop, SDNode OpNode,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>,
+ [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
- int_x86_bmi_pdep_32, loadi32>, T8XD;
+ X86pdep, loadi32>, T8XD;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
- int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+ X86pdep, loadi64>, T8XD, VEX_W;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
- int_x86_bmi_pext_32, loadi32>, T8XS;
+ X86pext, loadi32>, T8XS;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
- int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+ X86pext, loadi64>, T8XS, VEX_W;
}
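The multiclass now matches the new X86pdep/X86pext DAG nodes instead of the intrinsics directly; at the source level these operations are still reached through the usual BMI2 intrinsics. A usage sketch, assuming a BMI2-capable target (e.g. compiled with -mbmi2):

// Usage sketch only; requires BMI2. _pdep_u32/_pext_u32 lower to the
// PDEP/PEXT instructions selected by the patterns above.
#include <immintrin.h>
#include <cstdint>

uint32_t depositBits(uint32_t Value, uint32_t Mask) {
  return _pdep_u32(Value, Mask);  // scatter low bits of Value into Mask positions
}

uint32_t extractBits(uint32_t Value, uint32_t Mask) {
  return _pext_u32(Value, Mask);  // gather the Mask-selected bits of Value
}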
//===----------------------------------------------------------------------===//
@@ -2785,11 +2827,11 @@ let SchedRW = [WriteStore] in {
def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore32 addr:$dst, GR32:$src)]>,
- T8, Requires<[HasMOVDIRI]>;
+ T8PS, Requires<[HasMOVDIRI]>;
def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore64 addr:$dst, GR64:$src)]>,
- T8, Requires<[In64BitMode, HasMOVDIRI]>;
+ T8PS, Requires<[In64BitMode, HasMOVDIRI]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -2856,6 +2898,23 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
+// SERIALIZE Instruction
+//
+def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
+ [(int_x86_serialize)]>, PS,
+ Requires<[HasSERIALIZE]>;
+
+//===----------------------------------------------------------------------===//
+// TSXLDTRK - TSX Suspend Load Address Tracking
+//
+let Predicates = [HasTSXLDTRK] in {
+ def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk",
+ [(int_x86_xsusldtrk)]>, XD;
+ def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk",
+ [(int_x86_xresldtrk)]>, XD;
+}
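A short usage sketch for the new SERIALIZE definition above, assuming a compiler and target with SERIALIZE support (e.g. -mserialize); the C-level entry point is the _serialize() intrinsic:

// Usage sketch only; requires SERIALIZE support in the compiler and CPU.
#include <immintrin.h>

void serializeExecution() {
  _serialize();  // architecturally serializes instruction execution
}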
+
+//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
@@ -2913,6 +2972,11 @@ let Predicates = [HasTBM] in {
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
@@ -2974,7 +3038,7 @@ def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
- [(int_x86_cldemote addr:$src)]>, TB;
+ [(int_x86_cldemote addr:$src)]>, PS;
//===----------------------------------------------------------------------===//
// Subsystems.
@@ -3013,6 +3077,9 @@ include "X86InstrSVM.td"
include "X86InstrTSX.td"
include "X86InstrSGX.td"
+// AMX instructions
+include "X86InstrAMX.td"
+
// System instructions.
include "X86InstrSystem.td"
@@ -3108,6 +3175,9 @@ def : MnemonicAlias<"smovl", "movsl", "att">;
def : MnemonicAlias<"smovq", "movsq", "att">;
def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"ud2bw", "ud1w", "att">;
+def : MnemonicAlias<"ud2bl", "ud1l", "att">;
+def : MnemonicAlias<"ud2bq", "ud1q", "att">;
def : MnemonicAlias<"verrw", "verr", "att">;
// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
index 0f4d4d764cc9..49940204c25a 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMMX.td
@@ -24,8 +24,9 @@
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero] in {
-def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasMMX] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "",
+ [(set VR64:$dst, (x86mmx (MMX_X86movw2d (i32 0))))]>;
}
let Constraints = "$src1 = $dst" in {
@@ -43,8 +44,7 @@ let Constraints = "$src1 = $dst" in {
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, OType:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -60,8 +60,7 @@ let Constraints = "$src1 = $dst" in {
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
@@ -81,8 +80,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst,
- (IntId64 (bitconvert (load_mmx addr:$src))))]>,
+ [(set VR64:$dst, (IntId64 (load_mmx addr:$src)))]>,
Sched<[sched.Folded]>;
}
@@ -101,8 +99,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
- (IntId64 VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ (IntId64 VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -118,8 +115,8 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2),
+ (i8 timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -164,23 +161,14 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))]>,
+ (x86mmx (MMX_X86movw2d GR32:$src)))]>,
Sched<[WriteVecMoveFromGpr]>;
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>,
+ (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>,
Sched<[WriteVecLoad]>;
-let Predicates = [HasMMX] in {
- def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
- (MMX_MOVD64rr GR32:$src)>;
- def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
- (MMX_SET0)>;
- def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
- (MMX_MOVD64rm addr:$src)>;
-}
-
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
"movd\t{$src, $dst|$dst, $src}", []>,
@@ -240,20 +228,21 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (x86mmx VR64:$src), addr:$dst)]>;
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (bitconvert
- (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))))))]>;
+ (x86mmx (MMX_X86movdq2q VR128:$src)))]>;
def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64
- (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))]>;
+ (v2i64 (MMX_X86movq2dq VR64:$src)))]>;
let isCodeGenOnly = 1, hasSideEffects = 1 in {
def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
@@ -272,14 +261,6 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
[(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
-let Predicates = [HasMMX] in {
- // movd to MMX register zero-extends
- def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
- (MMX_MOVD64rr GR32:$src)>;
- def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
- (MMX_MOVD64rm addr:$src)>;
-}
-
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
SchedWriteVecALU.MMX>;
@@ -566,27 +547,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(int_x86_mmx_pmovmskb VR64:$src))]>,
Sched<[WriteMMXMOVMSK]>;
-// MMX to XMM for vector types
-def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
- [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
- (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-// Low word of XMM to MMX.
-def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
- [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
-
-def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
- (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-
-def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
- (x86mmx (MMX_MOVQ64rm addr:$src))>;
-
-def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src)))))),
- (MMX_MOVQ2DQrr VR64:$src)>;
-
// Misc.
let SchedRW = [SchedWriteShuffle.MMX] in {
let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td
index 747f5aa86653..6439f717accb 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSGX.td
@@ -17,13 +17,13 @@
let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
- "encls", []>, TB;
+ "encls", []>, PS;
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
- "enclu", []>, TB;
+ "enclu", []>, PS;
// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
def ENCLV : I<0x01, MRM_C0, (outs), (ins),
- "enclv", []>, TB;
+ "enclv", []>, PS;
} // SchedRW
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
index c45f342ed75b..c3c9f22381f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td
@@ -43,7 +43,7 @@ let isCodeGenOnly = 1 in {
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, RegisterClass RC,
ValueType VT, string asm, Operand memopr,
- ComplexPattern mem_cpat, Domain d,
+ PatFrags mem_frags, Domain d,
X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
@@ -57,7 +57,7 @@ let hasSideEffects = 0 in {
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -720,11 +720,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
} // SchedRW
let Predicates = [UseAVX] in {
- // Also handle an i64 load because that may get selected as a faster way to
- // load the data.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (VMOVHPDrm VR128:$src1, addr:$src2)>;
+ // MOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
@@ -754,12 +750,6 @@ let Predicates = [UseSSE1] in {
let Predicates = [UseSSE2] in {
// MOVHPD patterns
-
- // Also handle an i64 load because that may get selected as a faster way to
- // load the data.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
@@ -884,6 +874,23 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf6
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_W, VEX_LIG;
+
+defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_W, VEX_LIG;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
@@ -923,6 +930,12 @@ let Predicates = [UseAVX] in {
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (any_sint_to_fp GR64:$src)),
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}
let isCodeGenOnly = 1 in {
@@ -938,6 +951,20 @@ defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
+defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
"cvtsi2ss", "cvtsi2ss{l}",
WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
@@ -952,12 +979,22 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
+let Predicates = [UseSSE1] in {
+ def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
+}
+
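The patterns just added (together with the VCVT*2SI forms earlier in this file) let scalar lrint/llrint be selected directly to cvtss2si/cvtsd2si, which round according to the current MXCSR rounding mode. A hedged source-level usage sketch for an SSE2-capable x86-64 target:

// Usage sketch only: on an SSE2/AVX x86-64 target these calls can be
// selected to cvtss2si / cvtsd2si via the patterns above.
#include <cmath>

long roundFloatToLong(float X) { return std::lrint(X); }
long long roundDoubleToLongLong(double X) { return std::llrint(X); }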
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
ValueType DstVT, ValueType SrcVT, SDNode OpNode,
- Operand memop, ComplexPattern mem_cpat, string asm,
+ Operand memop, PatFrags mem_frags, string asm,
X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
@@ -966,7 +1003,7 @@ let ExeDomain = d in {
Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
+ [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
Sched<[sched.Folded]>;
}
}
@@ -1247,7 +1284,7 @@ def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
@@ -1261,7 +1298,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
@@ -1745,124 +1782,94 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- SDNode OpNode, ValueType VT,
+ Operand memop, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm,
- X86FoldableSchedWrite sched> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
- let isCommutable = 1 in
- def rr : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
- Sched<[sched]>;
- def rm : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), timm:$cc))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
-}
-
-let isCodeGenOnly = 1 in {
- let ExeDomain = SSEPackedSingle in
- defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
- "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
- let ExeDomain = SSEPackedDouble in
- defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
- "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
-
- let Constraints = "$src1 = $dst" in {
- let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS;
- let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
- "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>, XD;
- }
-}
-
-multiclass sse12_cmp_scalar_int<Operand memop,
- Intrinsic Int, string asm, X86FoldableSchedWrite sched,
- ComplexPattern mem_cpat> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ X86FoldableSchedWrite sched,
+ PatFrags mem_frags> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
- [(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, timm:$cc))]>,
- Sched<[sched]>;
-let mayLoad = 1 in
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ VR128:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
- [(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, timm:$cc))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+ (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ (mem_frags addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1,
+ (ld_frag addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
}
-// Aliases to match intrinsics which expect XMM operand(s).
let ExeDomain = SSEPackedSingle in
-defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
- "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
- XS, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
-defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
- "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
- "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
- "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
-
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
PatFrag ld_frag, string OpcodeStr, Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
-let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
- ExeDomain = d in {
+ X86FoldableSchedWrite sched = WriteFComX> {
+ let ExeDomain = d in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
- Sched<[sched]>;
-let mayLoad = 1 in
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, Operand memop,
- ComplexPattern mem_cpat, string OpcodeStr,
+ PatFrags mem_frags, string OpcodeStr,
Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
-let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
+ X86FoldableSchedWrite sched = WriteFComX> {
+let ExeDomain = d in {
def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- mem_cpat:$src2))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (mem_frags addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
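
The comis/ucomis multiclasses above, with or without the intrinsic memory
fragments, all reduce to a scalar FP compare that sets EFLAGS. A minimal C++
sketch of that flag encoding (the helper name is illustrative only, not part
of this patch):

#include <cmath>
#include <cstdio>

// Illustrative model of the EFLAGS produced by (U)COMISS: greater, less and
// equal map to distinct ZF/PF/CF values, and an unordered (NaN) compare sets
// all three, which is what the [(set EFLAGS, (OpNode ...))] patterns select.
struct EFlags { bool ZF, PF, CF; };

static EFlags fcomFlags(float A, float B) {
  if (std::isunordered(A, B)) return {true, true, true};   // NaN operand
  if (A > B) return {false, false, false};
  if (A < B) return {false, false, true};
  return {true, false, false};                             // A == B
}

int main() {
  EFlags F = fcomFlags(1.0f, std::nanf(""));
  std::printf("unordered: ZF=%d PF=%d CF=%d\n", F.ZF, F.PF, F.CF);
  return 0;
}
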
@@ -1914,18 +1921,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
ValueType VT, string asm,
X86FoldableSchedWrite sched,
Domain d, PatFrag ld_frag> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
(VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
@@ -2812,7 +2817,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
- ComplexPattern int_cpat, Intrinsic Intr,
+ PatFrags mem_frags, Intrinsic Intr,
Predicate target, string Suffix> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
@@ -2828,13 +2833,13 @@ multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
+ def : Pat<(Intr (mem_frags addr:$src2)),
(!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
-multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
Intrinsic Intr, Predicate target> {
let Predicates = [target] in {
def : Pat<(Intr VR128:$src),
@@ -2842,7 +2847,7 @@ multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int
VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
+ def : Pat<(Intr (mem_frags addr:$src2)),
(!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
@@ -2968,28 +2973,28 @@ let Predicates = [HasAVX, NoVLX] in {
multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
AVXTarget>,
XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+ defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
- defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+ defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
- defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
}
@@ -3185,13 +3190,13 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins),
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
-// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
+// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
+def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
PS, Requires<[HasSSE1]>;
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
+def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
PS, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
+def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
PS, Requires<[HasMFence]>;
} // SchedRW
@@ -3213,11 +3218,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
- TB, Sched<[WriteLDMXCSR]>;
+ PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
- TB, Sched<[WriteSTMXCSR]>;
+ PS, Sched<[WriteSTMXCSR]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -4185,8 +4190,6 @@ let Predicates = [UseAVX] in {
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzload32 addr:$src)),
@@ -4199,8 +4202,6 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(MOV64toPQIrr GR64:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
@@ -4429,16 +4430,11 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
let Predicates = [UseSSE3] in {
- // No need for aligned memory as this only loads 64-bits.
- def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
- (MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
@@ -5022,7 +5018,9 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
@@ -5030,12 +5028,14 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
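
The extra bitcast forms above only teach instruction selection more ways to
recognise an extending load feeding PMOVSX/PMOVZX; the widening itself is
unchanged. A scalar C++ sketch of that widening, under assumed helper names:

#include <array>
#include <cassert>
#include <cstdint>

// Widen the low four lanes of a byte vector to 32-bit lanes, with either
// sign or zero extension, mirroring what PMOVSXBD / PMOVZXBD compute.
template <bool Signed>
std::array<int32_t, 4> extendLowBytesTo32(const std::array<uint8_t, 16> &Src) {
  std::array<int32_t, 4> Out{};
  for (int I = 0; I < 4; ++I)
    Out[I] = Signed ? int32_t(int8_t(Src[I])) : int32_t(Src[I]);
  return Out;
}

int main() {
  std::array<uint8_t, 16> V{0x7F, 0x80, 0x01, 0xFF};
  assert((extendLowBytesTo32<true>(V) == std::array<int32_t, 4>{127, -128, 1, -1}));
  assert((extendLowBytesTo32<false>(V) == std::array<int32_t, 4>{127, 128, 1, 255}));
  return 0;
}
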
@@ -5499,7 +5499,7 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
+ (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
@@ -5522,7 +5522,7 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
+ (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
@@ -6623,7 +6623,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
- T8, Sched<[sched]>;
+ T8PS, Sched<[sched]>;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -6634,7 +6634,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(memop addr:$src2), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (memop addr:$src2))))]>, T8,
+ (memop addr:$src2))))]>, T8PS,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6644,7 +6644,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 timm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TAPS,
Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6652,7 +6652,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
- (i8 timm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TAPS,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6687,7 +6687,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, PatFrag ld_frag,
bit Is2Addr = 0, RegisterClass RC = VR128,
X86MemOperand MemOp = i128mem> {
- let AsmString = OpcodeStr##
+ let AsmString = OpcodeStr#
!if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
@@ -6874,10 +6874,10 @@ defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
X86MemOperand MemOp, string Hi, string Lo> {
- def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
!add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
- def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
!add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}
@@ -7290,13 +7290,12 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
X86FoldableSchedWrite sched> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
+ [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
T8PD, VEX, Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
- T8PD, VEX, Sched<[sched.Folded]>;
+ []>, T8PD, VEX, Sched<[sched.Folded]>;
}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
@@ -7304,7 +7303,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
+ [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
TAPD, VEX, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
@@ -7322,44 +7321,26 @@ let Predicates = [HasF16C, NoVLX] in {
WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
(v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
+ def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
+ (VCVTPH2PSYrm addr:$src)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
(VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}
-// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasF16C, NoVLX] in {
- // Use MXCSR.RC for rounding instead of explicitly specifying the default
- // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
- // configurations we support (the default). However, falling back to MXCSR is
- // more consistent with other instructions, which are always controlled by it.
- // It's encoded as 0b100.
- def : Pat<(fp_to_f16 FR32:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
- (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
- (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
- (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
-}
-
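
The block removed above lowered fp_to_f16/f16_to_fp through
VCVTPS2PH/VCVTPH2PS with rounding immediate 4 (0b100, i.e. defer to
MXCSR.RC). A user-level equivalent, assuming a compiler that provides the
F16C intrinsics _cvtss_sh/_cvtsh_ss (for example GCC/Clang with -mf16c):

#include <cstdio>
#include <immintrin.h>   // _cvtss_sh / _cvtsh_ss; typically requires -mf16c

int main() {
  // _MM_FROUND_CUR_DIRECTION is the same 0b100 encoding the deleted patterns
  // used: round according to MXCSR.RC instead of a fixed rounding mode.
  unsigned short H = _cvtss_sh(3.14159f, _MM_FROUND_CUR_DIRECTION);
  float F = _cvtsh_ss(H);
  std::printf("half bits = 0x%04x, back to float = %f\n", unsigned(H), F);
  return 0;
}
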
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//
@@ -7415,7 +7396,7 @@ def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
-// NOTE: We're using FP instructions here, but exeuction domain fixing should
+// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
@@ -7496,46 +7477,6 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastl
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQYrm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8/i16.
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDrm addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDYrm addr:$src)>;
-}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWYrm addr:$src)>;
-}
-
-let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
@@ -7597,10 +7538,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
(VMOVDDUPrr VR128:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (VMOVDDUPrm addr:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
- (VMOVDDUPrm addr:$src)>;
}
let Predicates = [HasAVX1Only] in {
@@ -7760,39 +7697,43 @@ let Predicates = [HasAVX2, NoVLX] in {
//
multiclass avx2_pmovmask<string OpcodeStr,
Intrinsic IntLd128, Intrinsic IntLd256,
- Intrinsic IntSt128, Intrinsic IntSt256> {
+ Intrinsic IntSt128, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[WriteVecMaskedLoad]>;
+ VEX_4V, Sched<[schedX.RM]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[WriteVecMaskedStore]>;
+ VEX_4V, Sched<[schedX.MR]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
int_x86_avx2_maskload_d,
int_x86_avx2_maskload_d_256,
int_x86_avx2_maskstore_d,
- int_x86_avx2_maskstore_d_256>;
+ int_x86_avx2_maskstore_d_256,
+ WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskload_q,
int_x86_avx2_maskload_q_256,
int_x86_avx2_maskstore_q,
- int_x86_avx2_maskstore_q_256>, VEX_W;
+ int_x86_avx2_maskstore_q_256,
+ WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
ValueType MaskVT> {
@@ -7905,57 +7846,48 @@ let Predicates = [HasAVX2, NoVLX] in {
// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
- ValueType VTy, PatFrag GatherNode128,
- PatFrag GatherNode256, RegisterClass RC256,
+ ValueType VTy, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256,
ValueType MTx = VTx, ValueType MTy = VTy> {
+let mayLoad = 1, hasSideEffects = 0 in {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
- (GatherNode128 VR128:$src1, VR128:$mask,
- vectoraddr:$src2))]>,
- VEX, Sched<[WriteLoad]>;
+ []>, VEX, Sched<[WriteLoad]>;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
- (GatherNode256 RC256:$src1, RC256:$mask,
- vectoraddr:$src2))]>,
- VEX, VEX_L, Sched<[WriteLoad]>;
+ []>, VEX, VEX_L, Sched<[WriteLoad]>;
+}
}
let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
- mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
- mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
- mgatherv8i32, VR256, vx128mem, vy256mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
- mgatherv4i64, VR128, vx64mem, vy128mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
+ VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
+ VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
+ VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
+ VR128, vx64mem, vy128mem>;
let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
- mgatherv4i32, VR256, vx128mem, vx256mem,
- v2i64, v4i64>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
- mgatherv4i64, VR256, vx128mem, vy256mem,
- v2i64, v4i64>, VEX_W;
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
+ VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
+ VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
}
let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
- mgatherv8i32, VR256, vx128mem, vy256mem,
- v4i32, v8i32>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
- mgatherv4i64, VR128, vx64mem, vy128mem,
- v4i32, v4i32>;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
+ VR256, vx128mem, vy256mem, v4i32, v8i32>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
+ VR128, vx64mem, vy128mem, v4i32, v4i32>;
}
}
}
@@ -7969,8 +7901,8 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
X86MemOperand X86MemOp, bit Is2Addr = 0> {
let ExeDomain = SSEPackedInt,
AsmString = !if(Is2Addr,
- OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
- OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
let isCommutable = 1 in
def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
@@ -7987,8 +7919,8 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
X86MemOperand X86MemOp, bit Is2Addr = 0> {
let AsmString = !if(Is2Addr,
- OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
+ OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
@@ -8008,9 +7940,9 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
VR128, load, i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
- defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
+ defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
load, i128mem>, VEX_4V, VEX_W;
- defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
+ defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
load, i256mem>, VEX_4V, VEX_L, VEX_W;
}
}
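
One non-obvious bit in the vpclmulqdq alias multiclass above: the "hq"/"lq"
pieces of the alias mnemonic are folded directly into the PCLMULQDQ immediate
via !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq")). A standalone sketch of that
computation (the helper name is made up):

#include <cassert>
#include <cstdint>
#include <string>

// Mirrors the TableGen expression in vpclmulqdq_aliases_impl: bit 4 of the
// immediate comes from the Lo name part, bit 0 from the Hi name part.
static uint8_t pclmulAliasImm(const std::string &Hi, const std::string &Lo) {
  return uint8_t(((Lo == "hq") << 4) | (Hi == "hq"));
}

int main() {
  assert(pclmulAliasImm("lq", "lq") == 0x00);  // vpclmullqlqdq
  assert(pclmulAliasImm("hq", "lq") == 0x01);  // vpclmulhqlqdq
  assert(pclmulAliasImm("lq", "hq") == 0x10);  // vpclmullqhqdq
  assert(pclmulAliasImm("hq", "hq") == 0x11);  // vpclmulhqhqdq
  return 0;
}
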
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 9d974b716dda..823ff78b9903 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -472,19 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -570,19 +570,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -661,32 +661,32 @@ let Uses = [CL], SchedRW = [WriteSHDrrcl] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>,
TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>,
TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>,
TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>,
TB;
} // SchedRW
@@ -695,42 +695,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2,
(i8 imm:$src3)))]>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1,
(i8 imm:$src3)))]>,
TB;
} // SchedRW
@@ -739,70 +739,70 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
let Uses = [CL], SchedRW = [WriteSHDmrcl] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize16;
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize16;
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize16;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)]>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)]>, TB, OpSize32;
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize32;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst), CL),
+ addr:$dst)]>, TB;
} // SchedRW
let SchedRW = [WriteSHDmri] in {
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
} // SchedRW
@@ -1013,3 +1013,21 @@ let Predicates = [HasBMI2] in {
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
+
+def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)),
+ (ROL8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)),
+ (ROL16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)),
+ (ROL32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)),
+ (ROL64ri GR64:$src1, relocImm:$src2)>;
+
+def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)),
+ (ROR8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)),
+ (ROR16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)),
+ (ROR32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)),
+ (ROR64ri GR64:$src1, relocImm:$src2)>;
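
The SHLD/SHRD patterns above now select on the generic funnel-shift
operations; the SHRD patterns swap $src1 and $src2 because fshr takes its
high half from the first operand, and for SHRD that is $src2 while $dst
supplies the low half. A minimal 32-bit sketch of those semantics, with
assumed helper names:

#include <cassert>
#include <cstdint>

// Funnel shifts over 32 bits, mirroring LLVM's fshl/fshr semantics: the
// shift amount is taken modulo the bit width.
static uint32_t fshl32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt %= 32;
  if (Amt == 0) return Hi;
  return (Hi << Amt) | (Lo >> (32 - Amt));
}
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  Amt %= 32;
  if (Amt == 0) return Lo;
  return (Hi << (32 - Amt)) | (Lo >> Amt);
}

int main() {
  uint32_t Dst = 0x12345678, Src = 0x9ABCDEF0;
  // shld dst, src, 8  ==  fshl(dst, src, 8): the high bits come from dst.
  assert(fshl32(Dst, Src, 8) == 0x3456789A);
  // shrd dst, src, 8  ==  fshr(src, dst, 8): the low bits come from dst,
  // which is why the SHRD patterns above swap the register operands.
  assert(fshr32(Src, Dst, 8) == 0xF0123456);
  return 0;
}
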
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
index 7f41feb6c0d9..c23bc7ebbf70 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td
@@ -23,7 +23,20 @@ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
- def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+
+ def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+
+ def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
}
def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
@@ -149,12 +162,12 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
// Segment override instruction prefixes
let SchedRW = [WriteNop] in {
-def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
-def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
-def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
-def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
-def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
-def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+def CS_PREFIX : I<0x2E, PrefixByte, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, PrefixByte, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, PrefixByte, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, PrefixByte, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, PrefixByte, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -512,12 +525,12 @@ let SchedRW = [WriteSystem] in {
let SchedRW = [WriteSystem] in {
let Predicates = [HasXSAVE] in {
let Defs = [EDX, EAX], Uses = [ECX] in
- def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
let Uses = [EDX, EAX, ECX] in
def XSETBV : I<0x01, MRM_D1, (outs), (ins),
"xsetbv",
- [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
} // HasXSAVE
@@ -542,47 +555,47 @@ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
[(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec\t$dst",
- [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>;
def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec64\t$dst",
- [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>;
def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves\t$dst",
- [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves64\t$dst",
- [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors\t$dst",
- [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors64\t$dst",
- [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
} // SchedRW
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in
- def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB, REP;
def : InstAlias<"xstorerng", (XSTORE)>;
let SchedRW = [WriteSystem] in {
let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
- def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
- def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
- def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
- def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
- def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB, REP;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB, REP;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB, REP;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB, REP;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB, REP;
}
let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
- def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
- def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB, REP;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB, REP;
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
- def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB, REP;
} // SchedRW
//==-----------------------------------------------------------------------===//
@@ -590,10 +603,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
- [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB;
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS;
let Uses = [EAX, ECX, EDX] in
def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
- [(X86wrpkru EAX, EDX, ECX)]>, TB;
+ [(X86wrpkru EAX, EDX, ECX)]>, PS;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -653,15 +666,15 @@ let Predicates = [In64BitMode, HasINVPCID] in {
//===----------------------------------------------------------------------===//
// SMAP Instruction
let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS;
} // Uses, Defs
} // SchedRW
@@ -729,6 +742,6 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
- def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB,
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS,
Requires<[HasPCONFIG]>;
} // SchedRW
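
Most of the X86InstrSystem.td changes above only tighten the opcode prefix
classes (TB to PS, or adding REP) and do not change behaviour. For one of the
touched instructions, XGETBV, a user-level sketch using the _xgetbv
intrinsic, assuming a compiler that exposes it (for example GCC/Clang with
-mxsave) and a CPU that reports OSXSAVE:

#include <cstdio>
#include <immintrin.h>   // _xgetbv; typically requires -mxsave

int main() {
  // XGETBV with ECX = 0 reads XCR0; bits 1 and 2 indicate that the OS has
  // enabled SSE and AVX register state, the usual check before running AVX
  // code paths.
  unsigned long long Xcr0 = _xgetbv(0);
  bool SseAvxEnabled = (Xcr0 & 0x6) == 0x6;
  std::printf("XCR0 = 0x%llx, SSE+AVX state %s\n", Xcr0,
              SseAvxEnabled ? "enabled" : "not enabled");
  return 0;
}
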
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td
index 41b839425ccd..28563eeb4484 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrTSX.td
@@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
}
def XEND : I<0x01, MRM_D5, (outs), (ins),
- "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+ "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>;
let Defs = [EFLAGS] in
def XTEST : I<0x01, MRM_D6, (outs), (ins),
- "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
+ "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>;
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
@@ -52,8 +52,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
let SchedRW = [WriteSystem] in {
let isAsmParserOnly = 1 in {
-def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
-def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
+def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>;
}
} // SchedRW
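
XEND and XTEST above are the commit and query halves of an RTM transaction.
A minimal usage sketch with the RTM intrinsics, assuming -mrtm and
RTM-capable hardware (the else branch handles transaction aborts, not
missing hardware support):

#include <cstdio>
#include <immintrin.h>   // _xbegin / _xend; typically requires -mrtm

static int Counter = 0;

int main() {
  // XBEGIN starts the transaction and XEND commits it; on any abort,
  // control resumes at the _xbegin call with a non-_XBEGIN_STARTED status.
  if (_xbegin() == _XBEGIN_STARTED) {
    ++Counter;   // speculative update inside the transaction
    _xend();
  } else {
    ++Counter;   // software fallback after an abort
  }
  std::printf("Counter = %d\n", Counter);
  return 0;
}
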
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td
index 37bc4ce2e053..d204a33358ea 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrVMX.td
@@ -37,7 +37,7 @@ def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, PD;
// OF 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS;
// 0F 01 C2
def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td
index 229af366d940..a5976b7d2d74 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrXOP.td
@@ -40,14 +40,14 @@ let ExeDomain = SSEPackedInt in {
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
- Operand memop, ComplexPattern mem_cpat,
+ Operand memop, PatFrags mem_frags,
X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
+ [(set VR128:$dst, (Int (mem_frags addr:$src)))]>, XOP,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -335,13 +335,13 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
Sched<[sched]>;
- // FIXME: This pattern can't match.
+ // FIXME: We can't write a pattern for this in tablegen.
+ let hasSideEffects = 0, mayLoad = 1 in
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
- (X86andnp (load addr:$src3), RC:$src2))))]>,
+ []>,
XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -383,13 +383,13 @@ let Predicates = [HasXOP] in {
(VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv16i8 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv8i16 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv4i32 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1),
@@ -403,13 +403,13 @@ let Predicates = [HasXOP] in {
(VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv32i8 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv16i16 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv8i32 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
}
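
Every VPCMOV pattern above, regardless of which load type it matches,
describes the same bitwise select: (or (and mask, a), (andnp mask, b)). A
scalar sketch of that operation:

#include <cassert>
#include <cstdint>

// Bitwise select: for each bit, take 'a' where the mask bit is 1, else 'b'.
// This is the dag matched to VPCMOV, with andnp(m, b) meaning ~m & b.
static uint64_t bitselect(uint64_t Mask, uint64_t A, uint64_t B) {
  return (A & Mask) | (B & ~Mask);
}

int main() {
  assert(bitselect(0xFF00FF00FF00FF00ULL, ~0ULL, 0ULL) == 0xFF00FF00FF00FF00ULL);
  assert(bitselect(0x0F0F0F0F0F0F0F0FULL, 0x1111111111111111ULL,
                   0x2222222222222222ULL) == 0x2121212121212121ULL);
  return 0;
}
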
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 3f9d626ff912..60fb4d2ef4bf 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86RegisterBankInfo.h"
@@ -71,7 +72,7 @@ private:
// TODO: remove after supported by Tablegen-erated instruction selection.
unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc,
- uint64_t Alignment) const;
+ Align Alignment) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
@@ -394,7 +395,7 @@ bool X86InstructionSelector::select(MachineInstr &I) {
unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
const RegisterBank &RB,
unsigned Opc,
- uint64_t Alignment) const {
+ Align Alignment) const {
bool Isload = (Opc == TargetOpcode::G_LOAD);
bool HasAVX = STI.hasAVX();
bool HasAVX512 = STI.hasAVX512();
@@ -427,7 +428,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
HasAVX ? X86::VMOVSDmr :
X86::MOVSDmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
- if (Alignment >= 16)
+ if (Alignment >= Align(16))
return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
: HasAVX512
? X86::VMOVAPSZ128rm_NOVLX
@@ -446,7 +447,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
? X86::VMOVUPSZ128mr_NOVLX
: HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
- if (Alignment >= 32)
+ if (Alignment >= Align(32))
return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
: HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
: X86::VMOVAPSYrm)
@@ -461,7 +462,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
: HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
: X86::VMOVUPSYmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
- if (Alignment >= 64)
+ if (Alignment >= Align(64))
return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
@@ -520,13 +521,13 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
return false;
}
- if (MemOp.getAlignment() < Ty.getSizeInBits()/8) {
+ if (MemOp.getAlign() < Ty.getSizeInBits() / 8) {
LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n");
return false;
}
}
- unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
+ unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlign());
if (NewOpc == Opc)
return false;
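
The uint64_t-to-Align migration above changes only how the alignment is
carried; opcode selection still comes down to a power-of-two comparison per
vector width. A stand-in sketch of the 128-bit branch (the real class is
llvm::Align from llvm/Support/Alignment.h):

#include <cassert>
#include <cstdint>

// A minimal stand-in for llvm::Align: a power-of-two byte alignment.
struct Align {
  uint64_t Value;
  explicit Align(uint64_t V) : Value(V) {}
  friend bool operator>=(Align L, Align R) { return L.Value >= R.Value; }
};

// Mirrors the shape of the 128-bit branch in getLoadStoreOp: sufficiently
// aligned accesses may use MOVAPS-style opcodes, otherwise MOVUPS-style.
enum Opcode { MOVAPS_LOAD, MOVUPS_LOAD };

static Opcode select128BitLoad(Align A) {
  return A >= Align(16) ? MOVAPS_LOAD : MOVUPS_LOAD;
}

int main() {
  assert(select128BitLoad(Align(16)) == MOVAPS_LOAD);
  assert(select128BitLoad(Align(8)) == MOVUPS_LOAD);
  return 0;
}
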
@@ -1435,14 +1436,15 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
const Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- unsigned Align = DstTy.getSizeInBits();
+ Align Alignment = Align(DstTy.getSizeInBytes());
const DebugLoc &DbgLoc = I.getDebugLoc();
- unsigned Opc = getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Align);
+ unsigned Opc =
+ getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Alignment);
// Create the load from the constant pool.
const ConstantFP *CFP = I.getOperand(1).getFPImm();
- unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Align);
+ unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Alignment);
MachineInstr *LoadInst = nullptr;
unsigned char OpFlag = STI.classifyLocalReference(nullptr);
@@ -1456,7 +1458,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
- MF.getDataLayout().getPointerSize(), Align);
+ MF.getDataLayout().getPointerSize(), Alignment);
LoadInst =
addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg),
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 8f74a8fe041d..a19e12766e10 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -69,7 +69,7 @@ class X86InterleavedAccessGroup {
/// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
- void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
/// Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -127,7 +127,7 @@ public:
bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();
- Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+ Type *ShuffleEltTy = ShuffleVecTy->getElementType();
unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
unsigned WideInstSize;
@@ -150,7 +150,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
   // We support shuffles that represent stride 4 for the byte type with a size
   // of WideInstSize.
if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
- return true;
+ return true;
if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
(WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
@@ -165,7 +165,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
}
void X86InterleavedAccessGroup::decompose(
- Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+ Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
"Expected Load or Shuffle");
@@ -186,8 +186,8 @@ void X86InterleavedAccessGroup::decompose(
DecomposedVectors.push_back(
cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Op0, Op1,
- createSequentialMask(Builder, Indices[i],
- SubVecTy->getVectorNumElements(), 0))));
+ createSequentialMask(Indices[i], SubVecTy->getNumElements(),
+ 0))));
return;
}
@@ -201,7 +201,7 @@ void X86InterleavedAccessGroup::decompose(
// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
if (VecLength == 768 || VecLength == 1536) {
- VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+ VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
NumLoads = NumSubVectors * (VecLength / 384);
@@ -211,13 +211,20 @@ void X86InterleavedAccessGroup::decompose(
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
}
// Generate N loads of T type.
+ assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
+ "VecBaseTy's size must be a multiple of 8");
+ const Align FirstAlignment = LI->getAlign();
+ const Align SubsequentAlignment = commonAlignment(
+ FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
+ Align Alignment = FirstAlignment;
for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.
Value *NewBasePtr =
Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
- Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
+ Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
DecomposedVectors.push_back(NewLoad);
+ Alignment = SubsequentAlignment;
}
}
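
The decompose() change above derives each subsequent load's alignment from
the first load's alignment and the fixed sub-vector stride. A stand-in for
that commonAlignment() computation (illustrative; the real helper lives in
llvm/Support/Alignment.h):

#include <cassert>
#include <cstdint>

// Guaranteed alignment of (Ptr + Offset) when Ptr is A-aligned: the smaller
// of A and the largest power of two dividing Offset.
static uint64_t commonAlignment(uint64_t A, uint64_t Offset) {
  if (Offset == 0) return A;
  uint64_t OffsetAlign = Offset & -Offset;   // lowest set bit of Offset
  return OffsetAlign < A ? OffsetAlign : A;
}

int main() {
  // A 64-byte aligned base with 16-byte sub-vector strides: every load after
  // the first is only guaranteed 16-byte alignment, as in decompose() above.
  assert(commonAlignment(64, 16) == 16);
  assert(commonAlignment(64, 0) == 64);
  assert(commonAlignment(8, 48) == 8);
  return 0;
}
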
@@ -229,11 +236,11 @@ static MVT scaleVectorType(MVT VT) {
VT.getVectorNumElements() / 2);
}
-static uint32_t Concat[] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
+static constexpr int Concat[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
// genShuffleBland - Creates a shuffle according to two vectors. This function
// only works on instructions whose lanes fit inside 256-bit registers. According to
@@ -251,11 +258,11 @@ static uint32_t Concat[] = {
// By computing the shuffle on a sequence of 16 elements (one lane) and adding
// the correct offset, we create a vpshufb + blend sequence between two
// shuffles.
-static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
- SmallVectorImpl<uint32_t> &Out, int LowOffset,
- int HighOffset) {
+static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &Out, int LowOffset,
+ int HighOffset) {
assert(VT.getSizeInBits() >= 256 &&
- "This function doesn't accept width smaller then 256");
+ "This function doesn't accept width smaller then 256");
unsigned NumOfElm = VT.getVectorNumElements();
for (unsigned i = 0; i < Mask.size(); i++)
Out.push_back(Mask[i] + LowOffset);
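// Illustrative sketch (not part of the diff): the loop above simply rebases a
// 16-element lane mask by LowOffset; the HighOffset half of the blend (not
// shown in this hunk) is rebased in the same spirit so that it selects from
// the second source. A hypothetical standalone version of the rebasing step:
static void rebaseLaneMaskSketch(ArrayRef<int> LaneMask, int Offset,
                                 SmallVectorImpl<int> &Out) {
  for (int M : LaneMask)
    Out.push_back(M + Offset); // e.g. {0, 3, 6, ...} + 16 -> {16, 19, 22, ...}
}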
@@ -282,36 +289,35 @@ static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
- ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
- unsigned VecElems, unsigned Stride,
- IRBuilder<> Builder) {
+ ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
+ unsigned VecElems, unsigned Stride,
+ IRBuilder<> &Builder) {
if (VecElems == 16) {
for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] = Builder.CreateShuffleVector(
- Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+ Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
return;
}
- SmallVector<uint32_t, 32> OptimizeShuf;
+ SmallVector<int, 32> OptimizeShuf;
Value *Temp[8];
for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
- (i + 1) / Stride * 16);
+ (i + 1) / Stride * 16);
Temp[i / 2] = Builder.CreateShuffleVector(
- Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+ Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
OptimizeShuf.clear();
}
if (VecElems == 32) {
std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
return;
- }
- else
+ } else
for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] =
- Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+ Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
@@ -325,19 +331,19 @@ void X86InterleavedAccessGroup::interleave8bitStride4VF8(
MVT VT = MVT::v8i16;
TransposedMatrix.resize(2);
- SmallVector<uint32_t, 16> MaskLow;
- SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
- SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
+ SmallVector<int, 16> MaskLow;
+ SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
+ SmallVector<int, 32> MaskHighTemp1, MaskHighWord;
for (unsigned i = 0; i < 8; ++i) {
MaskLow.push_back(i);
MaskLow.push_back(i + 8);
}
- createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
- createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
- scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
- scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
+ createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
+ createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
+ narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
+ narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
Value *IntrVec1Low =
@@ -367,25 +373,25 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
MVT HalfVT = scaleVectorType(VT);
TransposedMatrix.resize(4);
- SmallVector<uint32_t, 32> MaskHigh;
- SmallVector<uint32_t, 32> MaskLow;
- SmallVector<uint32_t, 32> LowHighMask[2];
- SmallVector<uint32_t, 32> MaskHighTemp;
- SmallVector<uint32_t, 32> MaskLowTemp;
+ SmallVector<int, 32> MaskHigh;
+ SmallVector<int, 32> MaskLow;
+ SmallVector<int, 32> LowHighMask[2];
+ SmallVector<int, 32> MaskHighTemp;
+ SmallVector<int, 32> MaskLowTemp;
// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
// shuffle pattern.
- createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
- createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
+ createUnpackShuffleMask(VT, MaskLow, true, false);
+ createUnpackShuffleMask(VT, MaskHigh, false, false);
// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
// shuffle pattern.
- createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
- createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
- scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
- scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
+ createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
+ createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
+ narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
+ narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);
// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
// IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
@@ -433,7 +439,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
// For example, the shuffle pattern for VF 16 with a 256-bit register -> lanes = 2
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
- SmallVectorImpl<uint32_t> &Mask) {
+ SmallVectorImpl<int> &Mask) {
int VectorSize = VT.getSizeInBits();
int VF = VT.getVectorNumElements();
int LaneCount = std::max(VectorSize / 128, 1);
@@ -446,7 +452,7 @@ static void createShuffleStride(MVT VT, int Stride,
// inside a shuffleMask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example, shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
-static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
+static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
int VectorSize = VT.getSizeInBits();
int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
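// Illustrative sketch (not part of the diff): for a lane of VF elements and
// stride 3, group g holds the indices g, g + 3, g + 6, ... < VF, so its size
// is ceil((VF - g) / 3). For VF = 8 that yields {3, 3, 2}, matching the
// example in the comment above. Helper name is hypothetical.
static void groupSizesForStride3Sketch(int VF, SmallVectorImpl<int> &Sizes) {
  for (int g = 0; g < 3; ++g)
    Sizes.push_back((VF - g + 2) / 3); // integer form of ceil((VF - g) / 3)
}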
@@ -470,7 +476,7 @@ static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
// direction of the alignment. (false - align to the "right" side while true -
// align to the "left" side)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
- SmallVectorImpl<uint32_t> &ShuffleMask,
+ SmallVectorImpl<int> &ShuffleMask,
bool AlignDirection = true, bool Unary = false) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
@@ -519,7 +525,7 @@ static void DecodePALIGNRMask(MVT VT, unsigned Imm,
// Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11|
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
- unsigned VecElems, IRBuilder<> Builder) {
+ unsigned VecElems, IRBuilder<> &Builder) {
if (VecElems == 16) {
for (int i = 0; i < 3; i++)
Vec[i] = InVec[i];
@@ -547,11 +553,11 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
TransposedMatrix.resize(3);
- SmallVector<uint32_t, 32> VPShuf;
- SmallVector<uint32_t, 32> VPAlign[2];
- SmallVector<uint32_t, 32> VPAlign2;
- SmallVector<uint32_t, 32> VPAlign3;
- SmallVector<uint32_t, 3> GroupSize;
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[2];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
+ SmallVector<int, 3> GroupSize;
Value *Vec[6], *TempVector[3];
MVT VT = MVT::getVT(Shuffles[0]->getType());
@@ -605,8 +611,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// group2Shuffle reorders the shuffle stride back into sequential order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
-static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
- SmallVectorImpl<uint32_t> &Output) {
+static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<int> &Output) {
int IndexGroup[3] = {0, 0, 0};
int Index = 0;
int VectorWidth = VT.getSizeInBits();
@@ -633,11 +639,11 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
TransposedMatrix.resize(3);
- SmallVector<uint32_t, 3> GroupSize;
- SmallVector<uint32_t, 32> VPShuf;
- SmallVector<uint32_t, 32> VPAlign[3];
- SmallVector<uint32_t, 32> VPAlign2;
- SmallVector<uint32_t, 32> VPAlign3;
+ SmallVector<int, 3> GroupSize;
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[3];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
Value *Vec[3], *TempVector[3];
MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
@@ -682,7 +688,7 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
unsigned NumOfElm = VT.getVectorNumElements();
group2Shuffle(VT, GroupSize, VPShuf);
- reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
+ reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
void X86InterleavedAccessGroup::transpose_4x4(
@@ -692,25 +698,25 @@ void X86InterleavedAccessGroup::transpose_4x4(
TransposedMatrix.resize(4);
// dst = src1[0,1],src2[0,1]
- uint32_t IntMask1[] = {0, 1, 4, 5};
- ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
+ static constexpr int IntMask1[] = {0, 1, 4, 5};
+ ArrayRef<int> Mask = makeArrayRef(IntMask1, 4);
Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
// dst = src1[2,3],src2[2,3]
- uint32_t IntMask2[] = {2, 3, 6, 7};
+ static constexpr int IntMask2[] = {2, 3, 6, 7};
Mask = makeArrayRef(IntMask2, 4);
Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
// dst = src1[0],src2[0],src1[2],src2[2]
- uint32_t IntMask3[] = {0, 4, 2, 6};
+ static constexpr int IntMask3[] = {0, 4, 2, 6};
Mask = makeArrayRef(IntMask3, 4);
TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
// dst = src1[1],src2[1],src1[3],src2[3]
- uint32_t IntMask4[] = {1, 5, 3, 7};
+ static constexpr int IntMask4[] = {1, 5, 3, 7};
Mask = makeArrayRef(IntMask4, 4);
TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
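// Worked example (not part of the diff) for the 4x4 transpose above, with input
// rows r0..r3 of four elements each:
//   IntrVec1 = {r0[0], r0[1], r2[0], r2[1]}   IntrVec2 = {r1[0], r1[1], r3[0], r3[1]}
//   IntrVec3 = {r0[2], r0[3], r2[2], r2[3]}   IntrVec4 = {r1[2], r1[3], r3[2], r3[3]}
// The {0,4,2,6} and {1,5,3,7} selections then gather the columns, e.g.
//   TransposedMatrix[0] = {r0[0], r1[0], r2[0], r3[0]},
// so row i of the result is column i of the input.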
@@ -721,14 +727,14 @@ void X86InterleavedAccessGroup::transpose_4x4(
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
SmallVector<Value *, 4> TransposedVectors;
- VectorType *ShuffleTy = Shuffles[0]->getType();
+ auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
if (isa<LoadInst>(Inst)) {
// Try to generate target-sized register(/instruction).
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
- Type *ShuffleEltTy = Inst->getType();
- unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
+ auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
+ unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific
// instructions.
@@ -756,13 +762,14 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
return true;
}
- Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
- unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
+ Type *ShuffleEltTy = ShuffleTy->getElementType();
+ unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
// Lower the interleaved stores:
// 1. Decompose the interleaved wide shuffle into individual shuffle
// vectors.
- decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+ decompose(Shuffles[0], Factor,
+ FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
DecomposedVectors);
// 2. Transpose the interleaved-vectors into vectors of contiguous
@@ -793,8 +800,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// 4. Generate a store instruction for wide-vec.
StoreInst *SI = cast<StoreInst>(Inst);
- Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
- SI->getAlignment());
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());
return true;
}
@@ -826,7 +832,8 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
+ assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
+ 0 &&
"Invalid interleaved store");
// Holds the indices of SVI that correspond to the starting index of each
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 40bf28df3b90..1c10c07abeee 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -679,8 +679,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, TRUNCATE_TO_REG,
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
- X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
@@ -783,10 +783,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS, X86ISD::FSUBS_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FSUBS, X86ISD::FSUBS_RND),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE,
X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
@@ -997,7 +993,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1022,6 +1027,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse2_cmp_sd, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1104,8 +1110,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
- X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1157,10 +1161,8 @@ static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
}
static void verifyIntrinsicTables() {
- assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
- std::end(IntrinsicsWithoutChain)) &&
- std::is_sorted(std::begin(IntrinsicsWithChain),
- std::end(IntrinsicsWithChain)) &&
+ assert(llvm::is_sorted(IntrinsicsWithoutChain) &&
+ llvm::is_sorted(IntrinsicsWithChain) &&
"Intrinsic data tables should be sorted by Intrinsic ID");
assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
std::end(IntrinsicsWithoutChain)) ==
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index da53d6420021..84f560f2f9ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -85,14 +85,14 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
verify(*STI.getInstrInfo());
}
-bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memset:
case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
LegalizerHelper::UnableToLegalize)
return false;
MI.eraseFromParent();
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h
index 7a0f13fb5ae6..72d25096d72b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LegalizerInfo.h
@@ -32,8 +32,8 @@ private:
public:
X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
private:
void setLegalizerInfo32bit();
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 35fc439998f9..50f8b3477acc 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -822,79 +822,3 @@ INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() {
return new X86LoadValueInjectionLoadHardeningPass();
}
-
-namespace {
-
-/// The `X86LoadValueInjectionLoadHardeningPass` above depends on expensive
-/// analysis passes that add complexity to the pipeline. This complexity
-/// can cause noticable overhead when no optimizations are enabled, i.e., -O0.
-/// The purpose of `X86LoadValueInjectionLoadHardeningUnoptimizedPass` is to
-/// provide the same security as the optimized pass, but without adding
-/// unnecessary complexity to the LLVM pipeline.
-///
-/// The behavior of this pass is simply to insert an LFENCE after every load
-/// instruction.
-class X86LoadValueInjectionLoadHardeningUnoptimizedPass
- : public MachineFunctionPass {
-public:
- X86LoadValueInjectionLoadHardeningUnoptimizedPass()
- : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override {
- return "X86 Load Value Injection (LVI) Load Hardening (Unoptimized)";
- }
- bool runOnMachineFunction(MachineFunction &MF) override;
- static char ID;
-};
-
-} // end anonymous namespace
-
-char X86LoadValueInjectionLoadHardeningUnoptimizedPass::ID = 0;
-
-bool X86LoadValueInjectionLoadHardeningUnoptimizedPass::runOnMachineFunction(
- MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
- << " *****\n");
- const X86Subtarget *STI = &MF.getSubtarget<X86Subtarget>();
- if (!STI->useLVILoadHardening())
- return false;
-
- // FIXME: support 32-bit
- if (!STI->is64Bit())
- report_fatal_error("LVI load hardening is only supported on 64-bit", false);
-
- // Don't skip functions with the "optnone" attr but participate in opt-bisect.
- const Function &F = MF.getFunction();
- if (!F.hasOptNone() && skipFunction(F))
- return false;
-
- bool Modified = false;
- ++NumFunctionsConsidered;
-
- const TargetInstrInfo *TII = STI->getInstrInfo();
- for (auto &MBB : MF) {
- for (auto &MI : MBB) {
- if (!MI.mayLoad() || MI.getOpcode() == X86::LFENCE ||
- MI.getOpcode() == X86::MFENCE)
- continue;
-
- MachineBasicBlock::iterator InsertionPt =
- MI.getNextNode() ? MI.getNextNode() : MBB.end();
- BuildMI(MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
- ++NumFences;
- Modified = true;
- }
- }
-
- if (Modified)
- ++NumFunctionsMitigated;
-
- return Modified;
-}
-
-INITIALIZE_PASS(X86LoadValueInjectionLoadHardeningUnoptimizedPass, PASS_KEY,
- "X86 LVI load hardening", false, false)
-
-FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningUnoptimizedPass() {
- return new X86LoadValueInjectionLoadHardeningUnoptimizedPass();
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
index f5caaaae4d84..9ce2a4637e2e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -14,11 +14,12 @@
#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
#include "MCTargetDesc/X86TargetStreamer.h"
-#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
+#include "X86Subtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
@@ -43,6 +44,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -72,9 +74,30 @@ private:
} // end anonymous namespace
+/// A RAII helper which defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+ MCStreamer &OS;
+ const bool OldAllowAutoPadding;
+ NoAutoPaddingScope(MCStreamer &OS)
+ : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
+ changeAndComment(false);
+ }
+ ~NoAutoPaddingScope() { changeAndComment(OldAllowAutoPadding); }
+ void changeAndComment(bool b) {
+ if (b == OS.getAllowAutoPadding())
+ return;
+ OS.setAllowAutoPadding(b);
+ if (b)
+ OS.emitRawComment("autopadding");
+ else
+ OS.emitRawComment("noautopadding");
+ }
+};
+
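// Illustrative usage sketch (not part of the diff): NoAutoPaddingScope is meant
// to bracket a fixed-layout sequence (for example a TLS call sequence that the
// linker may rewrite) so the streamer cannot insert alignment padding inside
// it; the destructor restores the previous setting. Function name below is
// hypothetical.
static void emitFixedPairSketch(MCStreamer &OS, const MCInst &A, const MCInst &B,
                                const MCSubtargetInfo &STI) {
  NoAutoPaddingScope NoPadScope(OS); // auto-padding disabled for this scope
  OS.emitInstruction(A, STI);
  OS.emitInstruction(B, STI);
} // previous auto-padding setting restored here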
// Emit a minimal sequence of nops spanning NumBytes bytes.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI);
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget);
void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
const MCSubtargetInfo &STI,
@@ -94,13 +117,13 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
if (InShadow && CurrentShadowSize < RequiredShadowSize) {
InShadow = false;
- EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
- MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+ emitX86Nops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+ &MF->getSubtarget<X86Subtarget>());
}
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ OutStreamer->emitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -116,6 +139,10 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
+ const Triple &TT = TM.getTargetTriple();
+ if (MO.isGlobal() && TT.isOSBinFormatELF())
+ return AsmPrinter.getSymbolPreferLocal(*MO.getGlobal());
+
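// Note (not part of the diff): getSymbolPreferLocal() presumably returns the
// local alias of a dso_local global on ELF (e.g. "foo$local") when one exists,
// so intra-module references bind directly to the definition rather than to a
// preemptible symbol; otherwise it falls back to the regular symbol.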
const DataLayout &DL = MF.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
"Isn't a symbol reference");
@@ -272,7 +299,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
// local labels. This is only safe when the symbols are in the same
// section so we are restricting it to jumptable references.
MCSymbol *Label = Ctx.createTempSymbol();
- AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
+ AsmPrinter.OutStreamer->emitAssignment(Label, Expr);
Expr = MCSymbolRefExpr::create(Label, Ctx);
}
break;
@@ -482,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
"LEA has segment specified!");
break;
+ case X86::MULX32Hrr:
+ case X86::MULX32Hrm:
+ case X86::MULX64Hrr:
+ case X86::MULX64Hrm: {
+ // Turn into regular MULX by duplicating the destination.
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+ case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
+ case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+ case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ // Duplicate the destination.
+ unsigned DestReg = OutMI.getOperand(0).getReg();
+ OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+ break;
+ }
+
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended, but other isn't.
case X86::VMOVZPQILo2PQIrr:
@@ -929,6 +976,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
MCContext &Ctx = OutStreamer->getContext();
@@ -1034,29 +1082,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
/// Return the longest nop which can be efficiently decoded for the given
/// target CPU. 15 bytes is the longest single NOP instruction, but some
/// platforms can't decode the longest forms efficiently.
-static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
- uint64_t MaxNopLength = 10;
- if (STI.getFeatureBits()[X86::ProcIntelSLM])
- MaxNopLength = 7;
- else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
- MaxNopLength = 15;
- else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
- MaxNopLength = 11;
- return MaxNopLength;
+static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
+ if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
+ return 7;
+ if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
+ return 15;
+ if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
+ return 11;
+ if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit())
+ return 10;
+ if (Subtarget->is32Bit())
+ return 2;
+ return 1;
}
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
-static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI) {
- if (!Is64Bit) {
- // TODO Do additional checking if the CPU supports multi-byte nops.
- OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI);
- return 1;
- }
-
+static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
// Cap a single nop emission at the profitable value for the target
- NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
+ NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget));
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1125,25 +1170,26 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
NopSize += NumPrefixes;
for (unsigned i = 0; i != NumPrefixes; ++i)
- OS.EmitBytes("\x66");
+ OS.emitBytes("\x66");
switch (Opc) {
default: llvm_unreachable("Unexpected opcode");
case X86::NOOP:
- OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ OS.emitInstruction(MCInstBuilder(Opc), *Subtarget);
break;
case X86::XCHG16ar:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
+ OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX),
+ *Subtarget);
break;
case X86::NOOPL:
case X86::NOOPW:
- OS.EmitInstruction(MCInstBuilder(Opc)
+ OS.emitInstruction(MCInstBuilder(Opc)
.addReg(BaseReg)
.addImm(ScaleVal)
.addReg(IndexReg)
.addImm(Displacement)
.addReg(SegmentReg),
- STI);
+ *Subtarget);
break;
}
assert(NopSize <= NumBytes && "We overemitted?");
@@ -1151,39 +1197,16 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
}
/// Emit the optimal amount of multi-byte nops on X86.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI) {
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
unsigned NopsToEmit = NumBytes;
(void)NopsToEmit;
while (NumBytes) {
- NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
+ NumBytes -= emitNop(OS, NumBytes, Subtarget);
assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
}
}
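// Worked example (not part of the diff): emitX86Nops(OS, 13, Subtarget) on a
// generic 64-bit target, where maxLongNopLength() returns 10, iterates twice
// and would typically emit a 10-byte NOP followed by a 3-byte NOP.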
-/// A RAII helper which defines a region of instructions which can't have
-/// padding added between them for correctness.
-struct NoAutoPaddingScope {
- MCStreamer &OS;
- const bool OldAllowAutoPadding;
- NoAutoPaddingScope(MCStreamer &OS)
- : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
- changeAndComment(false);
- }
- ~NoAutoPaddingScope() {
- changeAndComment(OldAllowAutoPadding);
- }
- void changeAndComment(bool b) {
- if (b == OS.getAllowAutoPadding())
- return;
- OS.setAllowAutoPadding(b);
- if (b)
- OS.emitRawComment("autopadding");
- else
- OS.emitRawComment("noautopadding");
- }
-};
-
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
@@ -1192,8 +1215,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
StatepointOpers SOpers(&MI);
if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
- EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
- getSubtargetInfo());
+ emitX86Nops(*OutStreamer, PatchBytes, Subtarget);
} else {
// Lower call target and choose correct opcode
const MachineOperand &CallTarget = SOpers.getCallTarget();
@@ -1235,14 +1257,14 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
MCInst CallInst;
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
- OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
+ OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
}
// Record our statepoint node in the same section used by STACKMAP
// and PATCHPOINT
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStatepoint(*MILabel, MI);
}
@@ -1262,7 +1284,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
auto &Ctx = OutStreamer->getContext();
MCSymbol *FaultingLabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(FaultingLabel);
+ OutStreamer->emitLabel(FaultingLabel);
assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
@@ -1280,7 +1302,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
MI.addOperand(MaybeOperand.getValue());
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
- OutStreamer->EmitInstruction(MI, getSubtargetInfo());
+ OutStreamer->emitInstruction(MI, getSubtargetInfo());
}
void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
@@ -1317,7 +1339,17 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
if (Code.size() < MinSize) {
- if (MinSize == 2 && Opcode == X86::PUSH64r) {
+ if (MinSize == 2 && Subtarget->is32Bit() &&
+ Subtarget->isTargetWindowsMSVC() &&
+ (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3")) {
+ // For compatibility reasons, when targeting MSVC, it is important to
+ // generate a 'legacy' NOP in the form of an 8B FF MOV EDI, EDI. Some tools
+ // rely specifically on this pattern to be able to patch a function.
+ // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE.
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI),
+ *Subtarget);
+ } else if (MinSize == 2 && Opcode == X86::PUSH64r) {
// This is an optimization that lets us get away without emitting a nop in
// many cases.
//
@@ -1325,14 +1357,13 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
// bytes too, so the check on MinSize is important.
MCI.setOpcode(X86::PUSH64rmr);
} else {
- unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
- getSubtargetInfo());
+ unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget);
assert(NopSize == MinSize && "Could not implement MinSize!");
(void)NopSize;
}
}
- OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
+ OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
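// Illustrative note (not part of the diff): the "legacy" NOP emitted above is
// the MSVC hot-patch idiom at function entry:
//   8B FF        mov edi, edi      ; patchable 2-byte slot
// Patching tools overwrite this slot with a short jmp into padding placed just
// before the function, which is why a generic 2-byte NOP is not an acceptable
// substitute here.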
// Lower a stackmap of the form:
@@ -1342,7 +1373,7 @@ void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
unsigned NumShadowBytes = MI.getOperand(1).getImm();
@@ -1361,7 +1392,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers opers(&MI);
@@ -1410,8 +1441,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
- EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
- getSubtargetInfo());
+ emitX86Nops(*OutStreamer, NumBytes - EncodedBytes, Subtarget);
}
void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
@@ -1442,13 +1472,13 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
OutStreamer->AddComment("# XRay Custom Event Log");
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBinaryData("\xeb\x0f");
+ OutStreamer->emitBinaryData("\xeb\x0f");
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
@@ -1471,7 +1501,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
- EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 4, Subtarget);
}
}
@@ -1500,14 +1530,14 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
- EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 1, Subtarget);
OutStreamer->AddComment("xray custom event end.");
- // Record the sled version. Older versions of this sled were spelled
- // differently, so we let the runtime handle the different offsets we're
- // using.
- recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
+ // Record the sled version. Version 0 of this sled was spelled differently, so
+ // we let the runtime handle the different offsets we're using. Version 2
+ // changed the absolute address to a PC-relative address.
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
@@ -1538,13 +1568,13 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
OutStreamer->AddComment("# XRay Typed Event Log");
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBinaryData("\xeb\x14");
+ OutStreamer->emitBinaryData("\xeb\x14");
// An x86-64 convention may place three arguments into %rcx, %rdx, and R8,
// so we'll work with those. Or we may be called via SystemV, in which case
@@ -1569,7 +1599,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
- EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 4, Subtarget);
}
}
@@ -1603,12 +1633,12 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
- EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 1, Subtarget);
OutStreamer->AddComment("xray typed event end.");
// Record the sled version.
- recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0);
+ recordSled(CurSled, MI, SledKind::TYPED_EVENT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
@@ -1623,7 +1653,7 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
.getValueAsString()
.getAsInteger(10, Num))
return;
- EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, Num, Subtarget);
return;
}
// We want to emit the following pattern:
@@ -1640,15 +1670,15 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBytes("\xeb\x09");
- EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
- recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_ENTER, 2);
}
void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
@@ -1670,17 +1700,17 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
- OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
- EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
- recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
+ OutStreamer->emitInstruction(Ret, getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 10, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
@@ -1694,17 +1724,17 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
// tail call much like how we have it in PATCHABLE_RET.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBytes("\xeb\x09");
- EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, SledKind::TAIL_CALL);
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, SledKind::TAIL_CALL, 2);
unsigned OpCode = MI.getOperand(0).getImm();
OpCode = convertTailJumpOpcode(OpCode);
@@ -1717,7 +1747,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
- OutStreamer->EmitInstruction(TC, getSubtargetInfo());
+ OutStreamer->emitInstruction(TC, getSubtargetInfo());
}
// Returns instruction preceding MBBI in MachineFunction.
@@ -1961,300 +1991,9 @@ static unsigned getRegisterWidth(const MCOperandInfo &Info) {
llvm_unreachable("Unknown register class!");
}
-void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- X86MCInstLower MCInstLowering(*MF, *this);
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
-
- // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
- // are compressed from EVEX encoding to VEX encoding.
- if (TM.Options.MCOptions.ShowMCEncoding) {
- if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
- OutStreamer->AddComment("EVEX TO VEX Compression ", false);
- }
-
+static void addConstantComments(const MachineInstr *MI,
+ MCStreamer &OutStreamer) {
switch (MI->getOpcode()) {
- case TargetOpcode::DBG_VALUE:
- llvm_unreachable("Should be handled target independently");
-
- // Emit nothing here but a comment if we can.
- case X86::Int_MemBarrier:
- OutStreamer->emitRawComment("MEMBARRIER");
- return;
-
- case X86::EH_RETURN:
- case X86::EH_RETURN64: {
- // Lower these as normal, but add some comments.
- Register Reg = MI->getOperand(0).getReg();
- OutStreamer->AddComment(StringRef("eh_return, addr: %") +
- X86ATTInstPrinter::getRegisterName(Reg));
- break;
- }
- case X86::CLEANUPRET: {
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("CLEANUPRET");
- break;
- }
-
- case X86::CATCHRET: {
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("CATCHRET");
- break;
- }
-
- case X86::ENDBR32:
- case X86::ENDBR64: {
- // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
- // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
- // non-empty. If MI is the initial ENDBR, place the
- // __patchable_function_entries label after ENDBR.
- if (CurrentPatchableFunctionEntrySym &&
- CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
- MI == &MF->front().front()) {
- MCInst Inst;
- MCInstLowering.Lower(MI, Inst);
- EmitAndCountInstruction(Inst);
- CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
- OutStreamer->EmitLabel(CurrentPatchableFunctionEntrySym);
- return;
- }
- break;
- }
-
- case X86::TAILJMPr:
- case X86::TAILJMPm:
- case X86::TAILJMPd:
- case X86::TAILJMPd_CC:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64:
- case X86::TAILJMPd64:
- case X86::TAILJMPd64_CC:
- case X86::TAILJMPr64_REX:
- case X86::TAILJMPm64_REX:
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("TAILCALL");
- break;
-
- case X86::TLS_addr32:
- case X86::TLS_addr64:
- case X86::TLS_base_addr32:
- case X86::TLS_base_addr64:
- return LowerTlsAddr(MCInstLowering, *MI);
-
- // Loading/storing mask pairs requires two kmov operations. The second one of these
- // needs a 2 byte displacement relative to the specified address (with 32 bit spill
- // size). The pairs of 1bit masks up to 16 bit masks all use the same spill size,
- // they all are stored using MASKPAIR16STORE, loaded using MASKPAIR16LOAD.
- //
- // The displacement value might wrap around in theory, thus the asserts in both
- // cases.
- case X86::MASKPAIR16LOAD: {
- int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
- assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- Register Reg = MI->getOperand(0).getReg();
- Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
-
- // Load the first mask register
- MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
- MIB.addReg(Reg0);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
- MIB.addOperand(Op.getValue());
- }
- EmitAndCountInstruction(MIB);
-
- // Load the second mask register of the pair
- MIB = MCInstBuilder(X86::KMOVWkm);
- MIB.addReg(Reg1);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addImm(Disp + 2);
- } else {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
- MIB.addOperand(Op.getValue());
- }
- }
- EmitAndCountInstruction(MIB);
- return;
- }
-
- case X86::MASKPAIR16STORE: {
- int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
- assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
- Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
-
- // Store the first mask register
- MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue());
- MIB.addReg(Reg0);
- EmitAndCountInstruction(MIB);
-
- // Store the second mask register of the pair
- MIB = MCInstBuilder(X86::KMOVWmk);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addImm(Disp + 2);
- } else {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i));
- MIB.addOperand(Op.getValue());
- }
- }
- MIB.addReg(Reg1);
- EmitAndCountInstruction(MIB);
- return;
- }
-
- case X86::MOVPC32r: {
- // This is a pseudo op for a two instruction sequence with a label, which
- // looks like:
- // call "L1$pb"
- // "L1$pb":
- // popl %esi
-
- // Emit the call.
- MCSymbol *PICBase = MF->getPICBaseSymbol();
- // FIXME: We would like an efficient form for this, so we don't have to do a
- // lot of extra uniquing.
- EmitAndCountInstruction(
- MCInstBuilder(X86::CALLpcrel32)
- .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
-
- const X86FrameLowering *FrameLowering =
- MF->getSubtarget<X86Subtarget>().getFrameLowering();
- bool hasFP = FrameLowering->hasFP(*MF);
-
- // TODO: This is needed only if we require precise CFA.
- bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
- !OutStreamer->getDwarfFrameInfos().back().End;
-
- int stackGrowth = -RI->getSlotSize();
-
- if (HasActiveDwarfFrame && !hasFP) {
- OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
- }
-
- // Emit the label.
- OutStreamer->EmitLabel(PICBase);
-
- // popl $reg
- EmitAndCountInstruction(
- MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
-
- if (HasActiveDwarfFrame && !hasFP) {
- OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
- }
- return;
- }
-
- case X86::ADD32ri: {
- // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
- if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
- break;
-
- // Okay, we have something like:
- // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
-
- // For this, we want to print something like:
- // MYGLOBAL + (. - PICBASE)
- // However, we can't generate a ".", so just emit a new label here and refer
- // to it.
- MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
-
- // Now that we have emitted the label, lower the complex operand expression.
- MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
- const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
- const MCExpr *PICBase =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
- DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
-
- DotExpr = MCBinaryExpr::createAdd(
- MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
-
- EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
- .addReg(MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addExpr(DotExpr));
- return;
- }
- case TargetOpcode::STATEPOINT:
- return LowerSTATEPOINT(*MI, MCInstLowering);
-
- case TargetOpcode::FAULTING_OP:
- return LowerFAULTING_OP(*MI, MCInstLowering);
-
- case TargetOpcode::FENTRY_CALL:
- return LowerFENTRY_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_OP:
- return LowerPATCHABLE_OP(*MI, MCInstLowering);
-
- case TargetOpcode::STACKMAP:
- return LowerSTACKMAP(*MI);
-
- case TargetOpcode::PATCHPOINT:
- return LowerPATCHPOINT(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
- return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_RET:
- return LowerPATCHABLE_RET(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_TAIL_CALL:
- return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_EVENT_CALL:
- return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
- return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
-
- case X86::MORESTACK_RET:
- EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- return;
-
- case X86::MORESTACK_RET_RESTORE_R10:
- // Return, then restore R10.
- EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- EmitAndCountInstruction(
- MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
- return;
-
- case X86::SEH_PushReg:
- case X86::SEH_SaveReg:
- case X86::SEH_SaveXMM:
- case X86::SEH_StackAlloc:
- case X86::SEH_StackAlign:
- case X86::SEH_SetFrame:
- case X86::SEH_PushFrame:
- case X86::SEH_EndPrologue:
- EmitSEHInstruction(MI);
- return;
-
- case X86::SEH_Epilogue: {
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Conservatively assume that pseudo instructions don't emit code and keep
- // looking for a call. We may emit an unnecessary nop in some cases.
- if (!MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
- return;
- }
-
// Lower PSHUFB and VPERMILP normally but add a comment if we can find
// a constant shuffle mask. We won't be able to do this at the MC layer
// because the mask isn't an immediate.
@@ -2270,30 +2009,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPSHUFBZrm:
case X86::VPSHUFBZrmk:
case X86::VPSHUFBZrmkz: {
- if (!OutStreamer->isVerboseAsm())
- break;
- unsigned SrcIdx, MaskIdx;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::PSHUFBrm:
- case X86::VPSHUFBrm:
- case X86::VPSHUFBYrm:
- case X86::VPSHUFBZ128rm:
- case X86::VPSHUFBZ256rm:
- case X86::VPSHUFBZrm:
- SrcIdx = 1; MaskIdx = 5; break;
- case X86::VPSHUFBZ128rmkz:
- case X86::VPSHUFBZ256rmkz:
- case X86::VPSHUFBZrmkz:
- SrcIdx = 2; MaskIdx = 6; break;
- case X86::VPSHUFBZ128rmk:
- case X86::VPSHUFBZ256rmk:
- case X86::VPSHUFBZrmk:
- SrcIdx = 3; MaskIdx = 7; break;
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
}
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
- assert(MI->getNumOperands() >= 6 &&
- "We should always have at least 6 operands!");
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2301,7 +2029,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -2328,9 +2056,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMILPDZrm:
case X86::VPERMILPDZrmk:
case X86::VPERMILPDZrmkz: {
- if (!OutStreamer->isVerboseAsm())
- break;
- unsigned SrcIdx, MaskIdx;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
@@ -2339,33 +2064,42 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZrm:
- SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrmkz:
- SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZrmk:
- SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+ ElSize = 32;
+ break;
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZrm:
- SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrmkz:
- SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZrmk:
- SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
+ ElSize = 64;
+ break;
}
- assert(MI->getNumOperands() >= 6 &&
- "We should always have at least 6 operands!");
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
+ }
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
+
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2373,7 +2107,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -2382,10 +2116,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PSrm:
case X86::VPERMIL2PDYrm:
case X86::VPERMIL2PSYrm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- assert(MI->getNumOperands() >= 8 &&
- "We should always have at least 8 operands!");
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands + 1) &&
+ "Unexpected number of operands!");
const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
if (!CtrlOp.isImm())
@@ -2398,47 +2130,43 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
}
- const MachineOperand &MaskOp = MI->getOperand(6);
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::VPPERMrrm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- assert(MI->getNumOperands() >= 7 &&
- "We should always have at least 7 operands!");
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
- const MachineOperand &MaskOp = MI->getOperand(6);
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::MMX_MOVQ64rm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
}
}
break;
@@ -2489,11 +2217,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VBROADCASTI64X2Z128rm:
case X86::VBROADCASTI64X2rm:
case X86::VBROADCASTI64X4rm:
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
int NumLanes = 1;
// Override NumLanes for the broadcast instructions.
switch (MI->getOpcode()) {
@@ -2535,7 +2261,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
@@ -2547,80 +2273,79 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << ">";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
}
}
break;
+
case X86::MOVDDUPrm:
case X86::VMOVDDUPrm:
case X86::VMOVDDUPZ128rm:
case X86::VBROADCASTSSrm:
case X86::VBROADCASTSSYrm:
- case X86::VBROADCASTSSZ128m:
- case X86::VBROADCASTSSZ256m:
- case X86::VBROADCASTSSZm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
case X86::VBROADCASTSDYrm:
- case X86::VBROADCASTSDZ256m:
- case X86::VBROADCASTSDZm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
case X86::VPBROADCASTBrm:
case X86::VPBROADCASTBYrm:
- case X86::VPBROADCASTBZ128m:
- case X86::VPBROADCASTBZ256m:
- case X86::VPBROADCASTBZm:
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
case X86::VPBROADCASTDrm:
case X86::VPBROADCASTDYrm:
- case X86::VPBROADCASTDZ128m:
- case X86::VPBROADCASTDZ256m:
- case X86::VPBROADCASTDZm:
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
case X86::VPBROADCASTQrm:
case X86::VPBROADCASTQYrm:
- case X86::VPBROADCASTQZ128m:
- case X86::VPBROADCASTQZ256m:
- case X86::VPBROADCASTQZm:
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
case X86::VPBROADCASTWrm:
case X86::VPBROADCASTWYrm:
- case X86::VPBROADCASTWZ128m:
- case X86::VPBROADCASTWZ256m:
- case X86::VPBROADCASTWZm:
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
int NumElts;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::MOVDDUPrm: NumElts = 2; break;
- case X86::VMOVDDUPrm: NumElts = 2; break;
- case X86::VMOVDDUPZ128rm: NumElts = 2; break;
- case X86::VBROADCASTSSrm: NumElts = 4; break;
- case X86::VBROADCASTSSYrm: NumElts = 8; break;
- case X86::VBROADCASTSSZ128m: NumElts = 4; break;
- case X86::VBROADCASTSSZ256m: NumElts = 8; break;
- case X86::VBROADCASTSSZm: NumElts = 16; break;
- case X86::VBROADCASTSDYrm: NumElts = 4; break;
- case X86::VBROADCASTSDZ256m: NumElts = 4; break;
- case X86::VBROADCASTSDZm: NumElts = 8; break;
- case X86::VPBROADCASTBrm: NumElts = 16; break;
- case X86::VPBROADCASTBYrm: NumElts = 32; break;
- case X86::VPBROADCASTBZ128m: NumElts = 16; break;
- case X86::VPBROADCASTBZ256m: NumElts = 32; break;
- case X86::VPBROADCASTBZm: NumElts = 64; break;
- case X86::VPBROADCASTDrm: NumElts = 4; break;
- case X86::VPBROADCASTDYrm: NumElts = 8; break;
- case X86::VPBROADCASTDZ128m: NumElts = 4; break;
- case X86::VPBROADCASTDZ256m: NumElts = 8; break;
- case X86::VPBROADCASTDZm: NumElts = 16; break;
- case X86::VPBROADCASTQrm: NumElts = 2; break;
- case X86::VPBROADCASTQYrm: NumElts = 4; break;
- case X86::VPBROADCASTQZ128m: NumElts = 2; break;
- case X86::VPBROADCASTQZ256m: NumElts = 4; break;
- case X86::VPBROADCASTQZm: NumElts = 8; break;
- case X86::VPBROADCASTWrm: NumElts = 8; break;
- case X86::VPBROADCASTWYrm: NumElts = 16; break;
- case X86::VPBROADCASTWZ128m: NumElts = 8; break;
- case X86::VPBROADCASTWZ256m: NumElts = 16; break;
- case X86::VPBROADCASTWZm: NumElts = 32; break;
+ case X86::MOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPZ128rm: NumElts = 2; break;
+ case X86::VBROADCASTSSrm: NumElts = 4; break;
+ case X86::VBROADCASTSSYrm: NumElts = 8; break;
+ case X86::VBROADCASTSSZ128rm: NumElts = 4; break;
+ case X86::VBROADCASTSSZ256rm: NumElts = 8; break;
+ case X86::VBROADCASTSSZrm: NumElts = 16; break;
+ case X86::VBROADCASTSDYrm: NumElts = 4; break;
+ case X86::VBROADCASTSDZ256rm: NumElts = 4; break;
+ case X86::VBROADCASTSDZrm: NumElts = 8; break;
+ case X86::VPBROADCASTBrm: NumElts = 16; break;
+ case X86::VPBROADCASTBYrm: NumElts = 32; break;
+ case X86::VPBROADCASTBZ128rm: NumElts = 16; break;
+ case X86::VPBROADCASTBZ256rm: NumElts = 32; break;
+ case X86::VPBROADCASTBZrm: NumElts = 64; break;
+ case X86::VPBROADCASTDrm: NumElts = 4; break;
+ case X86::VPBROADCASTDYrm: NumElts = 8; break;
+ case X86::VPBROADCASTDZ128rm: NumElts = 4; break;
+ case X86::VPBROADCASTDZ256rm: NumElts = 8; break;
+ case X86::VPBROADCASTDZrm: NumElts = 16; break;
+ case X86::VPBROADCASTQrm: NumElts = 2; break;
+ case X86::VPBROADCASTQYrm: NumElts = 4; break;
+ case X86::VPBROADCASTQZ128rm: NumElts = 2; break;
+ case X86::VPBROADCASTQZ256rm: NumElts = 4; break;
+ case X86::VPBROADCASTQZrm: NumElts = 8; break;
+ case X86::VPBROADCASTWrm: NumElts = 8; break;
+ case X86::VPBROADCASTWYrm: NumElts = 16; break;
+ case X86::VPBROADCASTWZ128rm: NumElts = 8; break;
+ case X86::VPBROADCASTWZ256rm: NumElts = 16; break;
+ case X86::VPBROADCASTWZrm: NumElts = 32; break;
}
std::string Comment;
@@ -2634,8 +2359,241 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
printConstant(C, CS);
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+}
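
The per-opcode NumElts table above is simply the destination register width divided by the broadcast element width; a minimal standalone check that reproduces a few entries (the register and element widths are read off the opcode names, nothing else is assumed):

// Not LLVM code: spot-check that NumElts == vector bits / element bits for a
// few of the broadcast opcodes handled above.
constexpr unsigned numElts(unsigned VecBits, unsigned EltBits) {
  return VecBits / EltBits;
}
static_assert(numElts(128, 64) == 2, "MOVDDUPrm, VPBROADCASTQZ128rm");
static_assert(numElts(256, 32) == 8, "VBROADCASTSSYrm, VPBROADCASTDZ256rm");
static_assert(numElts(512, 16) == 32, "VPBROADCASTWZrm");
static_assert(numElts(512, 8) == 64, "VPBROADCASTBZrm");
int main() { return 0; }
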
+
+void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
+ // are compressed from EVEX encoding to VEX encoding.
+ if (TM.Options.MCOptions.ShowMCEncoding) {
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
+ OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+ }
+
+ // Add comments for values loaded from constant pool.
+ if (OutStreamer->isVerboseAsm())
+ addConstantComments(MI, *OutStreamer);
+
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ Register Reg = MI->getOperand(0).getReg();
+ OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
+ case X86::ENDBR32:
+ case X86::ENDBR64: {
+ // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+ // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+ // non-empty. If MI is the initial ENDBR, place the
+ // __patchable_function_entries label after ENDBR.
+ if (CurrentPatchableFunctionEntrySym &&
+ CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+ MI == &MF->front().front()) {
+ MCInst Inst;
+ MCInstLowering.Lower(MI, Inst);
+ EmitAndCountInstruction(Inst);
+ CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ return;
}
+ break;
+ }
+
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return LowerTlsAddr(MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ const X86FrameLowering *FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
+ // Emit the label.
+ OutStreamer->emitLabel(PICBase);
+
+ // popl $reg
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(stackGrowth);
+ }
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
+
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::FENTRY_CALL:
+ return LowerFENTRY_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_OP:
+ return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_RET:
+ return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
+ case X86::MORESTACK_RET:
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
+ return;
+
+ case X86::SEH_PushReg:
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_StackAlloc:
+ case X86::SEH_StackAlign:
+ case X86::SEH_SetFrame:
+ case X86::SEH_PushFrame:
+ case X86::SEH_EndPrologue:
+ EmitSEHInstruction(MI);
+ return;
+
+ case X86::SEH_Epilogue: {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ MachineBasicBlock::const_iterator MBBI(MI);
+ // Check if preceded by a call and emit nop if so.
+ for (MBBI = PrevCrossBBInst(MBBI);
+ MBBI != MachineBasicBlock::const_iterator();
+ MBBI = PrevCrossBBInst(MBBI)) {
+ // Conservatively assume that pseudo instructions don't emit code and keep
+ // looking for a call. We may emit an unnecessary nop in some cases.
+ if (!MBBI->isPseudo()) {
+ if (MBBI->isCall())
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ break;
+ }
+ }
+ return;
+ }
}
MCInst TmpInst;
@@ -2652,7 +2610,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
- OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
+ OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
return;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 5cb80a082b56..eedad952c3b9 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -13,9 +13,10 @@
#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -62,12 +63,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg = 0;
+ Register SRetReturnReg;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg = 0;
+ Register GlobalBaseReg;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex = 0;
@@ -104,6 +105,13 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// True if this function has WIN_ALLOCA instructions.
bool HasWinAlloca = false;
+ /// True if this function has any preallocated calls.
+ bool HasPreallocatedCall = false;
+
+ ValueMap<const Value *, size_t> PreallocatedIds;
+ SmallVector<size_t, 0> PreallocatedStackSizes;
+ SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
+
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
/// that must be forwarded to every musttail call.
@@ -143,11 +151,11 @@ public:
int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
- unsigned getSRetReturnReg() const { return SRetReturnReg; }
- void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
- unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
- void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
@@ -185,6 +193,36 @@ public:
bool hasWinAlloca() const { return HasWinAlloca; }
void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+
+ bool hasPreallocatedCall() const { return HasPreallocatedCall; }
+ void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+
+ size_t getPreallocatedIdForCallSite(const Value *CS) {
+ auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()});
+ if (Insert.second) {
+ PreallocatedStackSizes.push_back(0);
+ PreallocatedArgOffsets.emplace_back();
+ }
+ return Insert.first->second;
+ }
+
+ void setPreallocatedStackSize(size_t Id, size_t StackSize) {
+ PreallocatedStackSizes[Id] = StackSize;
+ }
+
+ size_t getPreallocatedStackSize(const size_t Id) {
+ assert(PreallocatedStackSizes[Id] != 0 && "stack size not set");
+ return PreallocatedStackSizes[Id];
+ }
+
+ void setPreallocatedArgOffsets(size_t Id, ArrayRef<size_t> AO) {
+ PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end());
+ }
+
+ const ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) {
+ assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set");
+ return PreallocatedArgOffsets[Id];
+ }
};
} // End llvm namespace
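
The new preallocated-call bookkeeping is an id-keyed table: an id is handed out per call site, and the stack size and argument offsets are recorded against it. A hypothetical usage snippet, not code from the tree; the call-site pointer, sizes, and offsets are invented and only the member functions added above are assumed:

// Hypothetical illustration of the X86MachineFunctionInfo preallocated API.
#include "X86MachineFunctionInfo.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static void recordPreallocatedCall(X86MachineFunctionInfo &MFI,
                                   const Value *CallSite) {
  size_t Id = MFI.getPreallocatedIdForCallSite(CallSite); // stable per call site
  MFI.setHasPreallocatedCall(true);
  MFI.setPreallocatedStackSize(Id, 32); // bytes reserved for this call's arguments
  size_t Offsets[] = {0, 8, 16};        // offset of each argument in that area
  MFI.setPreallocatedArgOffsets(Id, Offsets);
}

static size_t lookupPreallocatedSize(X86MachineFunctionInfo &MFI,
                                     const Value *CallSite) {
  // Asking for the id again returns the same slot, so later frame lowering can
  // read back what call lowering recorded.
  return MFI.getPreallocatedStackSize(MFI.getPreallocatedIdForCallSite(CallSite));
}
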
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp
index b19d1263e0c9..425054cfdd92 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86MacroFusion.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h
index d4ae54f657a5..05388b275ca3 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MacroFusion.h
@@ -14,10 +14,12 @@
#ifndef LLVM_LIB_TARGET_X86_X86MACROFUSION_H
#define LLVM_LIB_TARGET_X86_X86MACROFUSION_H
-#include "llvm/CodeGen/MachineScheduler.h"
+#include <memory>
namespace llvm {
+class ScheduleDAGMutation;
+
/// Note that you have to add:
/// DAG.addMutation(createX86MacroFusionDAGMutation());
/// to X86PassConfig::createMachineScheduler() to have an effect.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 0c791b6674dc..c8899a85118e 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -578,7 +578,7 @@ bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
unsigned VReg,
int64_t AddrDispShift) {
- DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
+ const DIExpression *Expr = MI.getDebugExpression();
if (AddrDispShift != 0)
Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 4c6bd0ccc2cd..ec81b07f9e5f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -58,6 +58,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.addPreserved<LazyMachineBlockFrequencyInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
new file mode 100644
index 000000000000..8784a3df1773
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -0,0 +1,490 @@
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
+//===----------------------------------------------------------------------===//
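
For orientation, the pass operates on vectorized IR, but the shapes it targets come from ordinary scalar loops like the following (illustrative C++, not taken from the LLVM tests): a widening i16 multiply-accumulate that maps to pmaddwd, and a byte sum-of-absolute-differences that maps to psadbw, both of which end in a horizontal add reduction of i32 partial sums.

// Illustrative sources only; the pass itself matches the vectorized IR.
#include <cstddef>
#include <cstdint>

// Widening multiply-accumulate: i16 * i16 summed into i32 (pmaddwd shape).
int32_t dot16(const int16_t *a, const int16_t *b, size_t n) {
  int32_t sum = 0;
  for (size_t i = 0; i < n; ++i)
    sum += int32_t(a[i]) * int32_t(b[i]);
  return sum;
}

// Sum of absolute differences over bytes (psadbw shape).
uint32_t sad8(const uint8_t *a, const uint8_t *b, size_t n) {
  uint32_t sum = 0;
  for (size_t i = 0; i < n; ++i)
    sum += a[i] > b[i] ? a[i] - b[i] : b[i] - a[i];
  return sum;
}
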
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+ const DataLayout *DL;
+ const X86Subtarget *ST;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ X86PartialReduction() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Partial Reduction";
+ }
+
+private:
+ bool tryMAddReplacement(Instruction *Op);
+ bool trySADReplacement(Instruction *Op);
+};
+}
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+ return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+ "X86 Partial Reduction", false, false)
+
+bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // Need at least 8 elements.
+ if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
+ return false;
+
+ // Element type should be i32.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ auto *Mul = dyn_cast<BinaryOperator>(Op);
+ if (!Mul || Mul->getOpcode() != Instruction::Mul)
+ return false;
+
+ Value *LHS = Mul->getOperand(0);
+ Value *RHS = Mul->getOperand(1);
+
+ // LHS and RHS should only be used once, or if they are the same then only
+ // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
+ // instructions, otherwise we use punpck to emulate zero extend in stages. The
+ // truncs we need to do likely won't introduce new instructions in that case.
+ if (ST->hasSSE41()) {
+ if (LHS == RHS) {
+ if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+ return false;
+ } else {
+ if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+ return false;
+ if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+ return false;
+ }
+ }
+
+ auto CanShrinkOp = [&](Value *Op) {
+ auto IsFreeTruncation = [&](Value *Op) {
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == Mul->getParent() &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
+ return true;
+ }
+
+ return isa<Constant>(Op);
+ };
+
+ // If the operation can be freely truncated and has enough sign bits we
+ // can shrink.
+ if (IsFreeTruncation(Op) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+
+ // SelectionDAG has limited support for truncating through an add or sub if
+ // the inputs are freely truncatable.
+ if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
+ if (BO->getParent() == Mul->getParent() &&
+ IsFreeTruncation(BO->getOperand(0)) &&
+ IsFreeTruncation(BO->getOperand(1)) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Both Ops need to be shrinkable.
+ if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
+ return false;
+
+ IRBuilder<> Builder(Mul);
+
+ auto *MulTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = MulTy->getNumElements();
+
+ // Extract even elements and odd elements and add them together. This will
+ // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+ // half the original width.
+ SmallVector<int, 16> EvenMask(NumElts / 2);
+ SmallVector<int, 16> OddMask(NumElts / 2);
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ EvenMask[i] = i * 2;
+ OddMask[i] = i * 2 + 1;
+ }
+ // Creating a new mul so the replaceAllUsesWith below doesn't replace the
+ // uses in the shuffles we're creating.
+ Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1));
+ Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
+ Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
+ Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+ // Concatenate zeroes to extend back to the original type.
+ SmallVector<int, 32> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Zero = Constant::getNullValue(MAdd->getType());
+ Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+ Mul->replaceAllUsesWith(Concat);
+ Mul->eraseFromParent();
+
+ return true;
+}
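
The core of tryMAddReplacement is the even/odd split followed by zero-padding back to the original width. A standalone sketch of the three masks it builds for an 8 x i32 multiply; only the index arithmetic above is assumed:

// Not LLVM code: mirrors the mask arithmetic in tryMAddReplacement for
// NumElts == 8.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const unsigned NumElts = 8;
  std::vector<int> EvenMask(NumElts / 2), OddMask(NumElts / 2);
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    EvenMask[i] = i * 2;    // 0 2 4 6 -> even products
    OddMask[i] = i * 2 + 1; // 1 3 5 7 -> odd products
  }
  // MAdd = shuffle(mul, EvenMask) + shuffle(mul, OddMask): 4 x i32 partial sums.
  // ConcatMask then selects those 4 sums followed by 4 lanes of the zero
  // vector, widening the result back to 8 x i32.
  std::vector<int> ConcatMask(NumElts);
  std::iota(ConcatMask.begin(), ConcatMask.end(), 0); // 0..7
  for (int m : EvenMask)
    std::printf("%d ", m);
  std::printf("| ");
  for (int m : OddMask)
    std::printf("%d ", m);
  std::printf("| ");
  for (int m : ConcatMask)
    std::printf("%d ", m);
  std::printf("\n");
  return 0;
}
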
+
+bool X86PartialReduction::trySADReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // TODO: There's nothing special about i32, any integer type above i16 should
+ // work just as well.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ // Select needs to implement absolute value.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+
+ // Need a subtract of two values.
+ auto *Sub = dyn_cast<BinaryOperator>(LHS);
+ if (!Sub || Sub->getOpcode() != Instruction::Sub)
+ return false;
+
+ // Look for zero extend from i8.
+ auto getZeroExtendedVal = [](Value *Op) -> Value * {
+ if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+ if (cast<VectorType>(ZExt->getOperand(0)->getType())
+ ->getElementType()
+ ->isIntegerTy(8))
+ return ZExt->getOperand(0);
+
+ return nullptr;
+ };
+
+ // Both operands of the subtract should be extends from vXi8.
+ Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+ Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+
+ IRBuilder<> Builder(SI);
+
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = OpTy->getNumElements();
+
+ unsigned IntrinsicNumElts;
+ Intrinsic::ID IID;
+ if (ST->hasBWI() && NumElts >= 64) {
+ IID = Intrinsic::x86_avx512_psad_bw_512;
+ IntrinsicNumElts = 64;
+ } else if (ST->hasAVX2() && NumElts >= 32) {
+ IID = Intrinsic::x86_avx2_psad_bw;
+ IntrinsicNumElts = 32;
+ } else {
+ IID = Intrinsic::x86_sse2_psad_bw;
+ IntrinsicNumElts = 16;
+ }
+
+ Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
+
+ if (NumElts < 16) {
+ // Pad input with zeroes.
+ SmallVector<int, 32> ConcatMask(16);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = NumElts; i != 16; ++i)
+ ConcatMask[i] = (i % NumElts) + NumElts;
+
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+ Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+ NumElts = 16;
+ }
+
+ // Intrinsics produce vXi64 and need to be casted to vXi32.
+ auto *I32Ty =
+ FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+ assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+ unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+ // First collect the pieces we need.
+ SmallVector<Value *, 4> Ops(NumSplits);
+ for (unsigned i = 0; i != NumSplits; ++i) {
+ SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
+ std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+ Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+ Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+ Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+ Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+ }
+
+ assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+ unsigned Stages = Log2_32(NumSplits);
+ for (unsigned s = Stages; s > 0; --s) {
+ unsigned NumConcatElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
+ for (unsigned i = 0; i != 1U << (s - 1); ++i) {
+ SmallVector<int, 64> ConcatMask(NumConcatElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+ }
+ }
+
+ // At this point the final value should be in Ops[0]. Now we need to adjust
+ // it to the final original type.
+ NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
+ if (NumElts == 2) {
+ // Extract down to 2 elements.
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
+ } else if (NumElts >= 8) {
+ SmallVector<int, 32> ConcatMask(NumElts);
+ unsigned SubElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
+ for (unsigned i = 0; i != SubElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = SubElts; i != NumElts; ++i)
+ ConcatMask[i] = (i % SubElts) + SubElts;
+
+ Value *Zero = Constant::getNullValue(Ops[0]->getType());
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+ }
+
+ SI->replaceAllUsesWith(Ops[0]);
+ SI->eraseFromParent();
+
+ return true;
+}
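
trySADReplacement picks the widest psadbw the subtarget offers, splits the input into pieces of that width, and concatenates the partial results pairwise. A worked sketch of that bookkeeping; the feature flags and element counts are example inputs, not subtarget queries:

// Not LLVM code: mirrors the split/concat arithmetic in trySADReplacement.
#include <cstdio>

int main() {
  struct { bool HasBWI, HasAVX2; unsigned NumElts; } Cases[] = {
      {false, false, 16}, // SSE2 only: one 16-byte psadbw
      {false, true, 32},  // AVX2: one 32-byte psadbw
      {false, true, 64},  // AVX2, 64 elements: two 32-byte calls, one concat stage
      {true, true, 64},   // AVX512BW: one 64-byte psadbw
  };
  for (auto &C : Cases) {
    unsigned IntrinsicNumElts = (C.HasBWI && C.NumElts >= 64)    ? 64
                                : (C.HasAVX2 && C.NumElts >= 32) ? 32
                                                                 : 16;
    unsigned NumSplits = C.NumElts / IntrinsicNumElts;
    unsigned Stages = 0; // Log2_32(NumSplits)
    for (unsigned s = NumSplits; s > 1; s >>= 1)
      ++Stages;
    std::printf("NumElts=%u -> %u call(s) of width %u, %u concat stage(s)\n",
                C.NumElts, NumSplits, IntrinsicNumElts, Stages);
  }
  return 0;
}
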
+
+// Walk backwards from the ExtractElementInst and determine if it is the end of
+// a horizontal reduction. Return the input to the reduction if we find one.
+static Value *matchAddReduction(const ExtractElementInst &EE) {
+ // Make sure we're extracting index 0.
+ auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
+ if (!Index || !Index->isNullValue())
+ return nullptr;
+
+ const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
+ if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
+ return nullptr;
+
+ unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(NumElems))
+ return nullptr;
+
+ const Value *Op = BO;
+ unsigned Stages = Log2_32(NumElems);
+ for (unsigned i = 0; i != Stages; ++i) {
+ const auto *BO = dyn_cast<BinaryOperator>(Op);
+ if (!BO || BO->getOpcode() != Instruction::Add)
+ return nullptr;
+
+ // If this isn't the first add, then it should only have 2 users, the
+ // shuffle and another add which we checked in the previous iteration.
+ if (i != 0 && !BO->hasNUses(2))
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ if (Shuffle) {
+ Op = RHS;
+ } else {
+ Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ Op = LHS;
+ }
+
+ // The first operand of the shuffle should be the same as the other operand
+ // of the bin op.
+ if (!Shuffle || Shuffle->getOperand(0) != Op)
+ return nullptr;
+
+ // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+ unsigned MaskEnd = 1 << i;
+ for (unsigned Index = 0; Index < MaskEnd; ++Index)
+ if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
+ return nullptr;
+ }
+
+ return const_cast<Value *>(Op);
+}
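
matchAddReduction walks the usual log2 shuffle-and-add pyramid downwards from the extract, and at stage i it only checks the first MaskEnd = 1 << i lanes of the shuffle mask. For an 8-element reduction the checked prefixes look like this; the sketch below covers only that one check:

// Not LLVM code: prints the mask prefix matchAddReduction expects at each
// stage of an 8-element horizontal add (stage 0 is the add nearest the extract).
#include <cstdio>

int main() {
  const unsigned NumElems = 8;
  unsigned Stages = 0; // Log2_32(NumElems)
  for (unsigned n = NumElems; n > 1; n >>= 1)
    ++Stages;
  for (unsigned i = 0; i != Stages; ++i) {
    unsigned MaskEnd = 1u << i;
    std::printf("stage %u: lanes 0..%u must come from", i, MaskEnd - 1);
    for (unsigned Index = 0; Index < MaskEnd; ++Index)
      std::printf(" %u", MaskEnd + Index); // upper half folded onto lower half
    std::printf("\n");
  }
  return 0;
}
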
+
+// See if this BO is reachable from this Phi by walking forward through
+// single-use BinaryOperators with the same opcode. If we get back to BO then
+// we know we've found a loop and it is safe to step through this Add to find
+// more leaves.
+static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) {
+ // The PHI itself should only have one use.
+ if (!Phi->hasOneUse())
+ return false;
+
+ Instruction *U = cast<Instruction>(*Phi->user_begin());
+ if (U == BO)
+ return true;
+
+ while (U->hasOneUse() && U->getOpcode() == BO->getOpcode())
+ U = cast<Instruction>(*U->user_begin());
+
+ return U == BO;
+}
+
+// Collect all the leaves of the tree of adds that feeds into the horizontal
+// reduction. Root is the Value that is used by the horizontal reduction.
+// We look through single use phis, single use adds, or adds that are used by
+// a phi that forms a loop with the add.
+static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
+ SmallPtrSet<Value *, 8> Visited;
+ SmallVector<Value *, 8> Worklist;
+ Worklist.push_back(Root);
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // The PHI node should have a single use unless it is the root node, in which
+ // case it has 2 uses.
+ if (!PN->hasNUses(PN == Root ? 2 : 1))
+ break;
+
+ // Push incoming values to the worklist.
+ for (Value *InV : PN->incoming_values())
+ Worklist.push_back(InV);
+
+ continue;
+ }
+
+ if (auto *BO = dyn_cast<BinaryOperator>(V)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Simple case. Single use, just push its operands to the worklist.
+ if (BO->hasNUses(BO == Root ? 2 : 1)) {
+ for (Value *Op : BO->operands())
+ Worklist.push_back(Op);
+ continue;
+ }
+
+ // If there is additional use, make sure it is an unvisited phi that
+ // gets us back to this node.
+ if (BO->hasNUses(BO == Root ? 3 : 2)) {
+ PHINode *PN = nullptr;
+ for (auto *U : Root->users())
+ if (auto *P = dyn_cast<PHINode>(U))
+ if (!Visited.count(P))
+ PN = P;
+
+ // If we didn't find a 2-input PHI then this isn't a case we can
+ // handle.
+ if (!PN || PN->getNumIncomingValues() != 2)
+ continue;
+
+ // Walk forward from this phi to see if it reaches back to this add.
+ if (!isReachableFromPHI(PN, BO))
+ continue;
+
+ // The phi forms a loop with this Add, push its operands.
+ for (Value *Op : BO->operands())
+ Worklist.push_back(Op);
+ }
+ }
+ }
+
+ // Not an add or phi, make it a leaf.
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!V->hasNUses(I == Root ? 2 : 1))
+ continue;
+
+ // Add this as a leaf.
+ Leaves.push_back(I);
+ }
+ }
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<X86TargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *EE = dyn_cast<ExtractElementInst>(&I);
+ if (!EE)
+ continue;
+
+ // First find a reduction tree.
+ // FIXME: Do we need to handle other opcodes than Add?
+ Value *Root = matchAddReduction(*EE);
+ if (!Root)
+ continue;
+
+ SmallVector<Instruction *, 8> Leaves;
+ collectLeaves(Root, Leaves);
+
+ for (Instruction *I : Leaves) {
+ if (tryMAddReplacement(I)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Don't do SAD matching on the root node. SelectionDAG already
+ // has support for that and currently generates better code.
+ if (I != Root && trySADReplacement(I))
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
index 93238983afa2..833013fb69f3 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86PfmCounters.td
@@ -223,3 +223,13 @@ def ZnVer1PfmCounters : ProcPfmCounters {
];
}
def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>;
+
+def ZnVer2PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"Zn2AGU", "ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch">,
+ PfmIssueCounter<"Zn2Divider", "div_op_count">
+ ];
+}
+def : PfmCountersBinding<"znver2", ZnVer2PfmCounters>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
index f69626b2622e..f456728cf47b 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -72,12 +72,6 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
}
}
-bool
-X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness.
- return true;
-}
-
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
return getEncodingValue(i);
@@ -633,18 +627,22 @@ static bool CantUseSP(const MachineFrameInfo &MFI) {
}
bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
-
- if (!EnableBasePointer)
- return false;
-
- // When we need stack realignment, we can't address the stack from the frame
- // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
- // can't address variables from the stack pointer. MS inline asm can
- // reference locals while also adjusting the stack pointer. When we can't
- // use both the SP and the FP, we need a separate base pointer register.
- bool CantUseFP = needsStackRealignment(MF);
- return CantUseFP && CantUseSP(MFI);
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (X86FI->hasPreallocatedCall())
+ return true;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = needsStackRealignment(MF);
+ return CantUseFP && CantUseSP(MFI);
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
@@ -667,7 +665,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
}
bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
- unsigned Reg, int &FrameIdx) const {
+ Register Reg, int &FrameIdx) const {
 // Since X86 defines assignCalleeSavedSpillSlots, which always returns true,
 // this function is neither used nor tested.
llvm_unreachable("Unused function on X86. Otherwise need a test case.");
@@ -728,7 +726,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Determine base register and offset.
int FIOffset;
- unsigned BasePtr;
+ Register BasePtr;
if (MI.isReturn()) {
assert((!needsStackRealignment(MF) ||
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
index b82920898069..3435c0a10b04 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -54,10 +54,6 @@ public:
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
- /// Code Generation virtual methods...
- ///
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
/// specified sub-register index which is in the specified register class B.
@@ -125,7 +121,7 @@ public:
bool canRealignStack(const MachineFunction &MF) const override;
- bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
int &FrameIdx) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
index 3cfaf714e93e..8de5b94bbffa 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -265,6 +265,16 @@ let SubRegIndices = [sub_ymm] in {
}
}
+// Tile "registers".
+def TMM0: X86Reg<"tmm0", 0>;
+def TMM1: X86Reg<"tmm1", 1>;
+def TMM2: X86Reg<"tmm2", 2>;
+def TMM3: X86Reg<"tmm3", 3>;
+def TMM4: X86Reg<"tmm4", 4>;
+def TMM5: X86Reg<"tmm5", 5>;
+def TMM6: X86Reg<"tmm6", 6>;
+def TMM7: X86Reg<"tmm7", 7>;
+
// Mask Registers, used by AVX-512 instructions.
def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>;
def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>;
@@ -498,7 +508,7 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
// which we do not have right now.
def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
-// When RBP is used as a base pointer in a 32-bit addresses environement,
+// When RBP is used as a base pointer in a 32-bit addresses environment,
// this is also safe to use the full register to access addresses.
// Since RBP will never be spilled, stick to a 32 alignment to save
// on memory consumption.
@@ -621,3 +631,8 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
// Bound registers
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
+
+// Tiles
+let isAllocatable = 0 in
+def TILE : RegisterClass<"X86", [untyped], 0,
+ (sequence "TMM%u", 0, 7)> {let Size = 8192;}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
index 9b1fcaa8a13d..4aea7bc253bb 100755
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -260,7 +260,8 @@ defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point
defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags.
+defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags (X87).
+defm : BWWriteResPair<WriteFComX, [BWPort1], 3>; // Floating point compare to flags (SSE).
defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication.
defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM).
@@ -351,8 +352,10 @@ defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
@@ -986,7 +989,7 @@ def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup62], (instrs FARJMP64)>;
+def: InstRW<[BWWriteResGroup62], (instrs FARJMP64m)>;
def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
@@ -1127,7 +1130,7 @@ def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup89], (instrs FARCALL64)>;
+def: InstRW<[BWWriteResGroup89], (instrs FARCALL64m)>;
def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
let Latency = 7;
@@ -1479,54 +1482,42 @@ def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>;
def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 22;
+ let Latency = 17;
let NumMicroOps = 7;
let ResourceCycles = [1,3,2,1];
}
-def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>;
+def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm)>;
def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 23;
+ let Latency = 18;
let NumMicroOps = 9;
let ResourceCycles = [1,3,4,1];
}
-def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>;
+def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERQPDYrm, VPGATHERQQYrm)>;
def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 24;
+ let Latency = 19;
let NumMicroOps = 9;
let ResourceCycles = [1,5,2,1];
}
-def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>;
+def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 25;
- let NumMicroOps = 7;
- let ResourceCycles = [1,3,2,1];
+ let Latency = 19;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,4,4,1];
}
-def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm,
- VGATHERDPSrm)>;
+def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 26;
- let NumMicroOps = 9;
- let ResourceCycles = [1,5,2,1];
-}
-def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>;
-
-def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 26;
+ let Latency = 21;
let NumMicroOps = 14;
let ResourceCycles = [1,4,8,1];
}
-def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>;
-
-def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 27;
- let NumMicroOps = 9;
- let ResourceCycles = [1,5,2,1];
-}
-def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>;
+def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 29;
@@ -1604,7 +1595,7 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Haswell and Broadwell Pipeline" > "Register allocation and
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
index 06f417501b21..746dbaeca189 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -261,6 +261,7 @@ defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>;
defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFCom, [HWPort1], 3>;
+defm : HWWriteResPair<WriteFComX, [HWPort1], 3>;
defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>;
defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>;
@@ -391,8 +392,10 @@ defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
@@ -996,7 +999,7 @@ def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup14], (instrs FARJMP64)>;
+def: InstRW<[HWWriteResGroup14], (instrs FARJMP64m)>;
def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
@@ -1205,7 +1208,7 @@ def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup48], (instrs FARCALL64)>;
+def: InstRW<[HWWriteResGroup48], (instrs FARCALL64m)>;
def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
let Latency = 3;
@@ -1784,80 +1787,60 @@ def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,
}
def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>;
-def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> {
- let Latency = 26;
+def HWWriteResGroup184 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 14;
let NumMicroOps = 12;
- let ResourceCycles = [2,2,1,3,2,2];
-}
-def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm,
- VPGATHERDQrm,
- VPGATHERDDrm)>;
-
-def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 24;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
+ let ResourceCycles = [2,2,2,1,3,2];
}
-def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm,
- VPGATHERQQYrm)>;
+def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, VPGATHERDQrm)>;
-def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 28;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
-}
-def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>;
-
-def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 25;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
+def HWWriteResGroup185 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 20;
+ let ResourceCycles = [3,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>;
+def: InstRW<[HWWriteResGroup185], (instrs VGATHERDPDYrm, VPGATHERDQYrm)>;
-def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 27;
+def HWWriteResGroup186 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
let NumMicroOps = 20;
let ResourceCycles = [3,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm,
- VPGATHERDQYrm)>;
+def: InstRW<[HWWriteResGroup186], (instrs VGATHERDPSrm, VPGATHERDDrm)>;
-def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 27;
+def HWWriteResGroup187 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 22;
let NumMicroOps = 34;
let ResourceCycles = [5,3,8,1,9,8];
}
-def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm,
- VPGATHERDDYrm)>;
+def: InstRW<[HWWriteResGroup187], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
-def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 23;
+def HWWriteResGroup188 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 15;
let NumMicroOps = 14;
let ResourceCycles = [3,3,2,1,3,2];
}
-def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm,
- VPGATHERQQrm)>;
+def: InstRW<[HWWriteResGroup188], (instrs VGATHERQPDrm, VPGATHERQQrm)>;
-def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 28;
- let NumMicroOps = 15;
- let ResourceCycles = [3,3,2,1,4,2];
+def HWWriteResGroup189 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>;
+def: InstRW<[HWWriteResGroup189], (instrs VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
-def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 25;
+def HWWriteResGroup190 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
let NumMicroOps = 15;
let ResourceCycles = [3,3,2,1,4,2];
}
-def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
- VGATHERDPSrm)>;
+def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Haswell and Broadwell Pipeline" > "Register allocation and
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 26d4d8fa3549..ac32f1b19990 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -238,6 +238,7 @@ defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>;
defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
defm : SBWriteResPair<WriteFCom, [SBPort1], 3>;
+defm : SBWriteResPair<WriteFComX, [SBPort1], 3>;
defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>;
defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>;
@@ -366,8 +367,10 @@ defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
@@ -481,7 +484,7 @@ def : WriteRes<WritePCmpEStrM, [SBPort015]> {
let ResourceCycles = [8];
}
def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
- let Latency = 11;
+ let Latency = 17;
let ResourceCycles = [7, 1];
}
@@ -503,7 +506,7 @@ def : WriteRes<WritePCmpEStrI, [SBPort015]> {
let ResourceCycles = [8];
}
def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
- let Latency = 4;
+ let Latency = 10;
let ResourceCycles = [7, 1];
}
@@ -541,7 +544,7 @@ def : WriteRes<WriteAESKeyGen, [SBPort015]> {
let ResourceCycles = [11];
}
def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
- let Latency = 8;
+ let Latency = 14;
let ResourceCycles = [10, 1];
}
@@ -551,7 +554,7 @@ def : WriteRes<WriteCLMul, [SBPort015]> {
let ResourceCycles = [18];
}
def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
- let Latency = 14;
+ let Latency = 20;
let ResourceCycles = [17, 1];
}
@@ -881,7 +884,7 @@ def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>;
+def: InstRW<[SBWriteResGroup64], (instrs FARJMP64m)>;
def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
let Latency = 7;
@@ -967,7 +970,7 @@ def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>;
+def: InstRW<[SBWriteResGroup87], (instrs FARCALL64m)>;
def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
@@ -1105,7 +1108,7 @@ def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Sandy Bridge and Ivy Bridge Pipeline" > "Register allocation and
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 9a511ecc0071..0599564765da 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -255,7 +255,8 @@ defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>;
defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags.
+defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags (X87).
+defm : SKLWriteResPair<WriteFComX, [SKLPort0], 2>; // Floating point compare to flags (SSE).
defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication.
defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>;
@@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
@@ -361,9 +364,9 @@ defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
-defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply.
-defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>;
-defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 5, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 5, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD.
defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>;
@@ -1012,7 +1015,7 @@ def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64)>;
+def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64m)>;
def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
@@ -1193,7 +1196,7 @@ def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64)>;
+def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64m)>;
def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 7;
@@ -1592,33 +1595,31 @@ def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
}
def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;
-def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
- let Latency = 22;
- let NumMicroOps = 5;
+def SKLWriteResGroupVEX2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm,
- VGATHERQPSrm,
- VPGATHERDDrm,
- VPGATHERDQrm,
- VPGATHERQDrm,
- VPGATHERQQrm)>;
+def: InstRW<[SKLWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
-def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def SKLWriteResGroupVEX4 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKLWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKLWriteResGroupVEX8 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
}
-def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
- VGATHERQPDYrm,
- VGATHERQPSYrm,
- VPGATHERDDYrm,
- VPGATHERDQYrm,
- VPGATHERQDYrm,
- VPGATHERQQYrm,
- VGATHERDPDYrm)>;
+def: InstRW<[SKLWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 23;
@@ -1745,7 +1746,7 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Skylake Pipeline" > "Register allocation and renaming".
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index a8c65435ab9b..7fc96d1eda89 100755
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -255,7 +255,8 @@ defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>;
defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>;
-defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags.
+defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags (X87).
+defm : SKXWriteResPair<WriteFComX, [SKXPort0], 2>; // Floating point compare to flags (SSE).
defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication.
defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>;
@@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
@@ -361,10 +364,10 @@ defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>;
defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
-defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>;
-defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>;
-defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 5, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 5, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 5, [1], 1, 7>;
defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD.
defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>;
defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>;
@@ -619,6 +622,8 @@ def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
"KOR(B|D|Q|W)rr",
"KXNOR(B|D|Q|W)rr",
"KXOR(B|D|Q|W)rr",
+ "KSET0(B|D|Q|W)", // Same as KXOR
+ "KSET1(B|D|Q|W)", // Same as KXNOR
"MMX_PADDS(B|W)irr",
"MMX_PADDUS(B|W)irr",
"MMX_PAVG(B|W)irr",
@@ -814,19 +819,26 @@ def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
}
def: InstRW<[SKXWriteResGroup32], (instrs VPSADBWZrr)>; // TODO: 512-bit ops require ports 0/1 to be joined.
def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
- "KADD(B|D|Q|W)rr",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPBROADCAST(B|W)rr",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup33 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup33], (instregex "KADD(B|D|Q|W)rr",
"KSHIFTL(B|D|Q|W)ri",
"KSHIFTR(B|D|Q|W)ri",
"KUNPCK(BW|DQ|WD)rr",
- "VALIGND(Z|Z128|Z256)rri",
- "VALIGNQ(Z|Z128|Z256)rri",
"VCMPPD(Z|Z128|Z256)rri",
"VCMPPS(Z|Z128|Z256)rri",
"VCMP(SD|SS)Zrr",
- "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
"VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
"VFPCLASS(SD|SS)Zrr",
- "VPBROADCAST(B|W)rr",
"VPCMPB(Z|Z128|Z256)rri",
"VPCMPD(Z|Z128|Z256)rri",
"VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
@@ -834,7 +846,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0
"VPCMPQ(Z|Z128|Z256)rri",
"VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
"VPCMPW(Z|Z128|Z256)rri",
- "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr",
"VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;
def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
@@ -1171,7 +1182,7 @@ def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64)>;
+def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64m)>;
def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
@@ -1331,8 +1342,8 @@ def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
(instregex "VBLENDMPDZ128rm(b?)",
"VBLENDMPSZ128rm(b?)",
- "VBROADCASTI32X2Z128m(b?)",
- "VBROADCASTSSZ128m(b?)",
+ "VBROADCASTI32X2Z128rm(b?)",
+ "VBROADCASTSSZ128rm(b?)",
"VINSERT(F|I)128rm",
"VMOVAPDZ128rm(b?)",
"VMOVAPSZ128rm(b?)",
@@ -1350,8 +1361,8 @@ def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
"VPADD(B|D|Q|W)Z128rm(b?)",
"(V?)PADD(B|D|Q|W)rm",
"VPBLENDM(B|D|Q|W)Z128rm(b?)",
- "VPBROADCASTDZ128m(b?)",
- "VPBROADCASTQZ128m(b?)",
+ "VPBROADCASTDZ128rm(b?)",
+ "VPBROADCASTQZ128rm(b?)",
"VPSUB(B|D|Q|W)Z128rm(b?)",
"(V?)PSUB(B|D|Q|W)rm",
"VPTERNLOGDZ128rm(b?)i",
@@ -1456,7 +1467,7 @@ def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64)>;
+def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64m)>;
def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -1516,9 +1527,8 @@ def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
- "VFPCLASSSDZrm(b?)",
- "VPBROADCASTB(Z|Z256)m(b?)",
- "VPBROADCASTW(Z|Z256)m(b?)")>;
+ "VPBROADCASTB(Z|Z256)rm(b?)",
+ "VPBROADCASTW(Z|Z256)rm(b?)")>;
def: InstRW<[SKXWriteResGroup119], (instrs VPBROADCASTBYrm,
VPBROADCASTWYrm,
VPMOVSXBDYrm,
@@ -1535,24 +1545,24 @@ def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
(instregex "VBLENDMPD(Z|Z256)rm(b?)",
"VBLENDMPS(Z|Z256)rm(b?)",
- "VBROADCASTF32X2Z256m(b?)",
- "VBROADCASTF32X2Zm(b?)",
+ "VBROADCASTF32X2Z256rm(b?)",
+ "VBROADCASTF32X2Zrm(b?)",
"VBROADCASTF32X4Z256rm(b?)",
"VBROADCASTF32X4rm(b?)",
"VBROADCASTF32X8rm(b?)",
"VBROADCASTF64X2Z128rm(b?)",
"VBROADCASTF64X2rm(b?)",
"VBROADCASTF64X4rm(b?)",
- "VBROADCASTI32X2Z256m(b?)",
- "VBROADCASTI32X2Zm(b?)",
+ "VBROADCASTI32X2Z256rm(b?)",
+ "VBROADCASTI32X2Zrm(b?)",
"VBROADCASTI32X4Z256rm(b?)",
"VBROADCASTI32X4rm(b?)",
"VBROADCASTI32X8rm(b?)",
"VBROADCASTI64X2Z128rm(b?)",
"VBROADCASTI64X2rm(b?)",
"VBROADCASTI64X4rm(b?)",
- "VBROADCASTSD(Z|Z256)m(b?)",
- "VBROADCASTSS(Z|Z256)m(b?)",
+ "VBROADCASTSD(Z|Z256)rm(b?)",
+ "VBROADCASTSS(Z|Z256)rm(b?)",
"VINSERTF32x4(Z|Z256)rm(b?)",
"VINSERTF32x8Zrm(b?)",
"VINSERTF64x2(Z|Z256)rm(b?)",
@@ -1577,8 +1587,8 @@ def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
"VPADD(B|D|Q|W)Yrm",
"VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
"VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
- "VPBROADCASTD(Z|Z256)m(b?)",
- "VPBROADCASTQ(Z|Z256)m(b?)",
+ "VPBROADCASTD(Z|Z256)rm(b?)",
+ "VPBROADCASTQ(Z|Z256)rm(b?)",
"VPSUB(B|D|Q|W)Yrm",
"VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
"VPTERNLOGD(Z|Z256)rm(b?)i",
@@ -1667,17 +1677,9 @@ def: InstRW<[SKXWriteResGroup136], (instrs VPMOVSXBWYrm,
VPMOVSXWDYrm,
VPMOVZXWDYrm)>;
def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
- "VCMP(PD|PS)Z128rm(b?)i",
- "VCMP(SD|SS)Zrm",
+ "VFPCLASSSDZrm(b?)",
"VFPCLASSSSZrm(b?)",
- "VPCMPBZ128rmi(b?)",
- "VPCMPDZ128rmi(b?)",
- "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
- "VPCMPGT(B|D|Q|W)Z128rm(b?)",
"(V?)PCMPGTQrm",
- "VPCMPQZ128rmi(b?)",
- "VPCMPU(B|D|Q|W)Z128rmi(b?)",
- "VPCMPWZ128rmi(b?)",
"VPERMI2D128rm(b?)",
"VPERMI2PD128rm(b?)",
"VPERMI2PS128rm(b?)",
@@ -1701,15 +1703,32 @@ def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
"VPMOVZXBWZ128rm(b?)",
"VPMOVZXDQZ128rm(b?)",
"VPMOVZXWDZ128rm(b?)",
- "VPMOVZXWQZ128rm(b?)",
- "VPTESTMBZ128rm(b?)",
- "VPTESTMDZ128rm(b?)",
- "VPTESTMQZ128rm(b?)",
- "VPTESTMWZ128rm(b?)",
- "VPTESTNMBZ128rm(b?)",
- "VPTESTNMDZ128rm(b?)",
- "VPTESTNMQZ128rm(b?)",
- "VPTESTNMWZ128rm(b?)")>;
+ "VPMOVZXWQZ128rm(b?)")>;
+
+def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
+ "VCMP(SD|SS)Zrm",
+ "VFPCLASSPDZ128rm(b?)",
+ "VFPCLASSPSZ128rm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;
def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 9;
@@ -1745,30 +1764,38 @@ def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
"ILD_F(16|32|64)m",
"VALIGND(Z|Z256)rm(b?)i",
"VALIGNQ(Z|Z256)rm(b?)i",
- "VCMPPD(Z|Z256)rm(b?)i",
- "VCMPPS(Z|Z256)rm(b?)i",
- "VPCMPB(Z|Z256)rmi(b?)",
- "VPCMPD(Z|Z256)rmi(b?)",
- "VPCMPEQB(Z|Z256)rm(b?)",
- "VPCMPEQD(Z|Z256)rm(b?)",
- "VPCMPEQQ(Z|Z256)rm(b?)",
- "VPCMPEQW(Z|Z256)rm(b?)",
- "VPCMPGTB(Z|Z256)rm(b?)",
- "VPCMPGTD(Z|Z256)rm(b?)",
- "VPCMPGTQ(Z|Z256)rm(b?)",
- "VPCMPGTW(Z|Z256)rm(b?)",
- "VPCMPQ(Z|Z256)rmi(b?)",
- "VPCMPU(B|D|Q|W)Z256rmi(b?)",
- "VPCMPU(B|D|Q|W)Zrmi(b?)",
- "VPCMPW(Z|Z256)rmi(b?)",
"VPMAXSQ(Z|Z256)rm(b?)",
"VPMAXUQ(Z|Z256)rm(b?)",
"VPMINSQ(Z|Z256)rm(b?)",
- "VPMINUQ(Z|Z256)rm(b?)",
- "VPTESTM(B|D|Q|W)Z256rm(b?)",
- "VPTESTM(B|D|Q|W)Zrm(b?)",
- "VPTESTNM(B|D|Q|W)Z256rm(b?)",
- "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
+ "VPMINUQ(Z|Z256)rm(b?)")>;
+
+def SKXWriteResGroup148_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VFPCLASSPD(Z|Z256)rm(b?)",
+ "VFPCLASSPS(Z|Z256)rm(b?)",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 10;
@@ -1938,14 +1965,14 @@ def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> {
- let Latency = 12;
+ let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> {
- let Latency = 12;
+ let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
@@ -2106,8 +2133,8 @@ def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKX
}
def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>;
-def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 18;
+def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 21;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
@@ -2134,21 +2161,19 @@ def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
}
def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 19;
+def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 22;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)",
- "VPMULLQZrm(b?)")>;
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>;
-def SKXWriteResGroup214 : SchedWriteRes<[]> {
- let Latency = 20;
- let NumMicroOps = 0;
+def SKXWriteResGroup211_1 : SchedWriteRes<[SKXPort23,SKXPort05]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm,
- VGATHERQPSZrm,
- VPGATHERDDZ128rm)>;
+def: InstRW<[SKXWriteResGroup211_1], (instregex "VPMULLQZrm(b?)")>;
def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
let Latency = 20;
@@ -2164,15 +2189,41 @@ def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
}
def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 20;
- let NumMicroOps = 5;
+def SKXWriteGatherEVEX2 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm,
- VGATHERQPSZ256rm,
- VPGATHERQDZ128rm,
- VPGATHERQDZ256rm)>;
+def: InstRW<[SKXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm,
+ VGATHERDPDZ128rm, VPGATHERDQZ128rm,
+ VGATHERQPDZ128rm, VPGATHERQQZ128rm)>;
+
+def SKXWriteGatherEVEX4 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm,
+ VGATHERQPDZ256rm, VPGATHERQQZ256rm,
+ VGATHERDPSZ128rm, VPGATHERDDZ128rm,
+ VGATHERDPDZ256rm, VPGATHERDQZ256rm)>;
+
+def SKXWriteGatherEVEX8 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm,
+ VGATHERDPDZrm, VPGATHERDQZrm,
+ VGATHERQPDZrm, VPGATHERQQZrm,
+ VGATHERQPSZrm, VPGATHERQDZrm)>;
+
+def SKXWriteGatherEVEX16 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,16,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>;
def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 20;
@@ -2202,57 +2253,31 @@ def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
-def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 22;
- let NumMicroOps = 5;
+def SKXWriteResGroupVEX2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm,
- VGATHERQPDZ128rm,
- VPGATHERDQZ128rm,
- VPGATHERQQZ128rm)>;
+def: InstRW<[SKXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
-def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
- let Latency = 22;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def SKXWriteResGroupVEX4 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
}
-def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm,
- VGATHERQPSrm,
- VPGATHERDDrm,
- VPGATHERDQrm,
- VPGATHERQDrm,
- VPGATHERQQrm,
- VPGATHERDDrm,
- VPGATHERQDrm,
- VPGATHERDQrm,
- VPGATHERQQrm,
- VGATHERDPSrm,
- VGATHERQPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm)>;
-
-def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def: InstRW<[SKXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKXWriteResGroupVEX8 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
}
-def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm,
- VGATHERQPDYrm,
- VGATHERQPSYrm,
- VPGATHERDDYrm,
- VPGATHERDQYrm,
- VPGATHERQDYrm,
- VPGATHERQQYrm,
- VPGATHERDDYrm,
- VPGATHERQDYrm,
- VPGATHERDQYrm,
- VPGATHERQQYrm,
- VGATHERDPSYrm,
- VGATHERQPSYrm,
- VGATHERDPDYrm)>;
+def: InstRW<[SKXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 22;
@@ -2276,27 +2301,6 @@ def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
-def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
- VGATHERQPDZ256rm,
- VPGATHERDQZ256rm,
- VPGATHERQDZrm,
- VPGATHERQQZ256rm)>;
-
-def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 26;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm,
- VGATHERQPDZrm,
- VPGATHERDQZrm,
- VPGATHERQQZrm)>;
-
def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 27;
let NumMicroOps = 2;
@@ -2304,14 +2308,6 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
-def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 27;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm,
- VPGATHERDDZ256rm)>;
-
def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 29;
let NumMicroOps = 15;
@@ -2326,14 +2322,6 @@ def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
-def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 30;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
- VPGATHERDDZrm)>;
-
def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
@@ -2461,7 +2449,7 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Skylake Pipeline" > "Register allocation and renaming".
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td
index 95f710061aeb..f204d6622119 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Schedule.td
@@ -250,7 +250,8 @@ defm WriteFCmp64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double
defm WriteFCmp64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double compare (XMM).
defm WriteFCmp64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (YMM).
defm WriteFCmp64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (ZMM).
-defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags.
+defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (X87).
+defm WriteFComX : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (SSE).
defm WriteFMul : X86SchedWritePair<ReadAfterVecLd>; // Floating point multiplication.
defm WriteFMulX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point multiplication (XMM).
defm WriteFMulY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point multiplication (YMM).
@@ -340,8 +341,10 @@ def WriteVecStoreX : SchedWrite;
def WriteVecStoreY : SchedWrite;
def WriteVecStoreNT : SchedWrite;
def WriteVecStoreNTY : SchedWrite;
-def WriteVecMaskedStore : SchedWrite;
-def WriteVecMaskedStoreY : SchedWrite;
+def WriteVecMaskedStore32 : SchedWrite;
+def WriteVecMaskedStore64 : SchedWrite;
+def WriteVecMaskedStore32Y : SchedWrite;
+def WriteVecMaskedStore64Y : SchedWrite;
def WriteVecMove : SchedWrite;
def WriteVecMoveX : SchedWrite;
def WriteVecMoveY : SchedWrite;
@@ -549,6 +552,14 @@ def WriteFMaskMove32Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
def WriteFMaskMove64Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
+def WriteVecMaskMove32
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore32>;
+def WriteVecMaskMove64
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore64>;
+def WriteVecMaskMove32Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore32Y>;
+def WriteVecMaskMove64Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore64Y>;
// Vector width wrappers.
def SchedWriteFAdd
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
index b0153ca9da36..b90baf6c16b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -244,6 +244,7 @@ defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7,
defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : X86WriteResPairUnsupported<WriteFMulY>;
@@ -368,8 +369,10 @@ def : WriteRes<WriteVecStoreX, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteVecStoreY>;
def : WriteRes<WriteVecStoreNT, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteVecStoreNTY>;
-def : WriteRes<WriteVecMaskedStore, [AtomPort0]>;
-defm : X86WriteResUnsupported<WriteVecMaskedStoreY>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
def : WriteRes<WriteVecMove, [AtomPort0]>;
def : WriteRes<WriteVecMoveX, [AtomPort01]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index d7aea3cf4e9d..0a201bc74a48 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -545,8 +545,40 @@ def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
// This is for simple LEAs with one or two input operands.
-// FIXME: SAGU 3-operand LEA
-def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; }
+def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; }
+
+// This write is used for slow LEA instructions.
+def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
+// or an LEA with a `Scale` value different than 1.
+def PdSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" different than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def PdWriteLEA : SchedWriteVariant<[
+ SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+
+def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
// Bit counts.
defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
@@ -766,6 +798,7 @@ defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
let Latency = 6;
@@ -1060,8 +1093,10 @@ def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>;
defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1, 1, 4]>;
-defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index d0421d94ee05..13b6eed5126d 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -541,6 +541,7 @@ defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
@@ -669,8 +670,10 @@ defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1],
defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
index dcd155ea0e0e..3d53ef104ed6 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -214,6 +214,7 @@ defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFComX, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
@@ -310,8 +311,10 @@ def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>;
-def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>;
-def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
@@ -390,44 +393,15 @@ defm : X86WriteResPairUnsupported<WritePHAddZ>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> {
- let Latency = 13;
- let ResourceCycles = [13];
-}
-def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 13;
- let ResourceCycles = [13, 1];
-}
+defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13]>;
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 17;
- let ResourceCycles = [17, 1];
-}
-
+defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>;
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 17;
- let ResourceCycles = [17, 1];
-}
+defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>;
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> {
- let Latency = 21;
- let ResourceCycles = [21];
-}
-def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 21;
- let ResourceCycles = [21, 1];
-}
+defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21]>;
// MOVMSK Instructions.
def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
@@ -436,42 +410,12 @@ def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; }
def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
// AES Instructions.
-def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
-
-def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
-
-def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
+defm : SLMWriteResPair<WriteAESDecEnc, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESIMC, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESKeyGen, [SLM_FPC_RSV0], 8, [5]>;
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> {
- let Latency = 10;
- let ResourceCycles = [10];
-}
-def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 10;
- let ResourceCycles = [10, 1];
-}
+defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10]>;
def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 06201f4a3a84..fe09d6f85221 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -295,6 +295,7 @@ defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -387,8 +388,10 @@ defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 4537d9cc7956..48da0d6329b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -187,7 +187,7 @@ defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>;
defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>;
defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>;
-defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>;
defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
@@ -216,7 +216,7 @@ defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>;
// Bit counts.
defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
-defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>;
defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>;
defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>;
@@ -272,15 +272,16 @@ defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 1>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 1>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>;
defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -313,8 +314,8 @@ defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>;
defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>;
-defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 4, [1], 1, 7, 0>;
-defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 4, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 3, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 3, [1], 1, 7, 0>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>;
@@ -325,16 +326,16 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>;
defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 3>;
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>;
defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>;
@@ -369,8 +370,10 @@ defm : X86WriteRes<WriteVecStoreX, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [Zn2AGU,Zn2FPU01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>;
@@ -380,7 +383,7 @@ defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>;
-defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 2>;
+defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
@@ -402,7 +405,7 @@ defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>;
defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 4, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>;
@@ -424,8 +427,8 @@ defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>;
// Vector Shift Operations
-defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Vector insert/extract operations.
@@ -469,6 +472,12 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
def Zn2WriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
}
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
+defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResPair<WritePHAddY, [], 3>;
def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
@@ -517,14 +526,14 @@ def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> {
let NumMicroOps = 2;
}
-def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+def : InstRW<[Zn2WriteXCHG], (instregex "^XCHG(8|16|32|64)rr", "^XCHG(16|32|64)ar")>;
// r,m.
def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
let Latency = 5;
let NumMicroOps = 2;
}
-def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "^XCHG(8|16|32|64)rm")>;
def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
@@ -594,8 +603,11 @@ def : InstRW<[WriteALULd],
def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
let Latency = 3;
}
+def Zn2WriteMul16Imm : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 4;
+}
def : SchedAlias<WriteIMul16, Zn2WriteMul16>;
-def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16Imm>;
def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>;
// m16.
@@ -1001,6 +1013,7 @@ def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
// mm <- mm.
def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ;
def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> {
+ let Latency = 4;
let NumMicroOps = 2;
}
def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ;
@@ -1109,15 +1122,6 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
//-- Arithmetic instructions --//
-// HADD, HSUB PS/PD
-// PHADD|PHSUB (S) W/D.
-def : SchedAlias<WritePHAdd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddX, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddY, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>;
-
// PCMPGTQ.
def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1137,8 +1141,12 @@ def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
// PSLL,PSRL,PSRA W/D/Q.
// x,x / v,v,x.
-def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> ;
-def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ;
+def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
+def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
// PSLL,PSRL DQ.
def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>;
@@ -1280,7 +1288,7 @@ def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> {
}
// CVTDQ2PD.
// x,x.
-def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2P(D|S)rr")>;
// Same as xmm
// y,x.
@@ -1290,9 +1298,9 @@ def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>;
def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> {
let Latency = 3;
}
-// CVT(T)PD2DQ.
+// CVT(T)P(D|S)2DQ.
// x,x.
-def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)P(D|S)2DQrr")>;
def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> {
let Latency = 10;
@@ -1322,7 +1330,7 @@ def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> {
- let Latency = 4;
+ let Latency = 3;
}
// same as CVTPD2DQr
@@ -1334,7 +1342,7 @@ def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> {
- let Latency = 4;
+ let Latency = 3;
}
// CVTSI2SD.
// x,r32/64.
@@ -1376,7 +1384,7 @@ defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
//-- SSE4A instructions --//
// EXTRQ
def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
- let Latency = 2;
+ let Latency = 3;
}
def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>;
@@ -1448,12 +1456,6 @@ def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
//-- Arithmetic instructions --//
-// HADD, HSUB PS/PD
-def : SchedAlias<WriteFHAdd, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddY, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>;
-
// VDIVPS.
// TODO - convert to Zn2WriteResFpuPair
// y,y,y.
@@ -1490,11 +1492,9 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
-def : SchedAlias<WriteDPPS, Zn2WriteMicrocoded>;
def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
// x,m,i / v,v,m,i.
-def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>;
def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>;
// DPPD.
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 1ae8df977f83..ce8d1d464da9 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -15,6 +15,7 @@
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
@@ -45,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
@@ -65,7 +66,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
- if ((Align & 3) != 0 || !ConstantSize ||
+ if (Alignment < Align(4) || !ConstantSize ||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
@@ -111,28 +112,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
uint64_t Val = ValC->getZExtValue() & 255;
// If the value is a constant, then we can potentially use larger sets.
- switch (Align & 3) {
- case 2: // WORD aligned
- AVT = MVT::i16;
- ValReg = X86::AX;
- Val = (Val << 8) | Val;
- break;
- case 0: // DWORD aligned
+ if (Alignment > Align(2)) {
+ // DWORD aligned
AVT = MVT::i32;
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
- if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
}
- break;
- default: // Byte aligned
+ } else if (Alignment == Align(2)) {
+ // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ } else {
+ // Byte aligned
AVT = MVT::i8;
ValReg = X86::AL;
Count = DAG.getIntPtrConstant(SizeVal, dl);
- break;
}
if (AVT.bitsGT(MVT::i8)) {
@@ -169,13 +169,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
EVT AddrVT = Dst.getValueType();
EVT SizeVT = Size.getValueType();
- Chain = DAG.getMemset(Chain, dl,
- DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
- DAG.getConstant(Offset, dl, AddrVT)),
- Val,
- DAG.getConstant(BytesLeft, dl, SizeVT),
- Align, isVolatile, false,
- DstPtrInfo.getWithOffset(Offset));
+ Chain =
+ DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, dl, AddrVT)),
+ Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+ isVolatile, false, DstPtrInfo.getWithOffset(Offset));
}
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
@@ -283,7 +282,7 @@ static SDValue emitConstantSizeRepmov(
Chain, dl,
DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
- DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile,
+ DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile,
/*AlwaysInline*/ true, /*isTailCall*/ false,
DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
@@ -291,7 +290,7 @@ static SDValue emitConstantSizeRepmov(
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
@@ -309,10 +308,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
/// Handle constant sizes,
if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
- return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(),
- Size.getValueType(), Align, isVolatile,
- AlwaysInline, DstPtrInfo, SrcPtrInfo);
+ return emitConstantSizeRepmov(
+ DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+ Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline,
+ DstPtrInfo, SrcPtrInfo);
return SDValue();
}
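
A quick illustration of the fill-value widening that the rewritten EmitTargetCodeForMemset performs above: the single fill byte is replicated into AX/EAX/RAX depending on which Align threshold is met. This is a minimal standalone sketch using plain C++ integers rather than the SelectionDAG types in the patch; the helper name is illustrative, not part of the change.

#include <cstdint>

// Replicate a fill byte across the chosen store width (2, 4, or 8 bytes),
// mirroring the (Val << 8) | Val style widening in the lowering above.
static uint64_t splatFillByte(uint8_t Byte, unsigned StoreBytes) {
  uint64_t Val = Byte;
  if (StoreBytes >= 2)
    Val = (Val << 8) | Val;   // 0xAB       -> 0xABAB
  if (StoreBytes >= 4)
    Val = (Val << 16) | Val;  // 0xABAB     -> 0xABABABAB
  if (StoreBytes == 8)
    Val = (Val << 32) | Val;  // 0xABABABAB -> 0xABABABABABABABAB
  return Val;
}

For example, splatFillByte(0xAB, 4) produces 0xABABABAB, the DWORD pattern the code above builds before loading it into EAX.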
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h
index 0f2d979f91e3..dac62973636c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -14,14 +14,9 @@
#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
-class X86TargetLowering;
-class X86TargetMachine;
-class X86Subtarget;
-
class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
/// Returns true if it is possible for the base register to conflict with the
/// given set of clobbers for a memory intrinsic.
@@ -33,13 +28,14 @@ public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment,
+ bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index a202fc63637b..de528299654c 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -11,8 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "Utils/X86ShuffleDecode.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
//===----------------------------------------------------------------------===//
@@ -34,17 +36,17 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
//
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
- Type *CstTy = C->getType();
- if (!CstTy->isVectorTy())
+ auto *CstTy = dyn_cast<FixedVectorType>(C->getType());
+ if (!CstTy)
return false;
- Type *CstEltTy = CstTy->getVectorElementType();
+ Type *CstEltTy = CstTy->getElementType();
if (!CstEltTy->isIntegerTy())
return false;
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- unsigned NumCstElts = CstTy->getVectorNumElements();
+ unsigned NumCstElts = CstTy->getNumElements();
assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
"Unaligned shuffle mask size");
@@ -185,13 +187,12 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
}
void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
- unsigned Width,
- SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
(void)MaskTySize;
- assert((MaskTySize == 128 || MaskTySize == 256) &&
- Width >= MaskTySize && "Unexpected vector size.");
+ assert((MaskTySize == 128 || MaskTySize == 256) && Width >= MaskTySize &&
+ "Unexpected vector size.");
// The shuffle mask requires elements the same size as the target.
APInt UndefElts;
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index 296341517579..51229a69a626 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -14,15 +14,13 @@
#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
-#include "llvm/ADT/SmallVector.h"
-
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
class Constant;
-class MVT;
+template <typename T> class SmallVectorImpl;
/// Decode a PSHUFB mask from an IR-level vector constant.
void DecodePSHUFBMask(const Constant *C, unsigned Width,
@@ -33,9 +31,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMILP2 variable mask from an IR-level vector constant.
-void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
- unsigned Width,
- SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPPERM variable mask from an IR-level vector constant.
void DecodeVPPERMMask(const Constant *C, unsigned Width,
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
new file mode 100644
index 000000000000..7e91c37367d2
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
@@ -0,0 +1,181 @@
+//===-- X86SpeculativeExecutionSideEffectSuppression.cpp ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file contains the X86 implementation of the speculative execution side
+/// effect suppression mitigation.
+///
+/// This must be used with the -mlvi-cfi flag in order to mitigate indirect
+/// branches and returns.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-seses"
+
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> EnableSpeculativeExecutionSideEffectSuppression(
+ "x86-seses-enable-without-lvi-cfi",
+ cl::desc("Force enable speculative execution side effect suppression. "
+ "(Note: User must pass -mlvi-cfi in order to mitigate indirect "
+ "branches and returns.)"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OneLFENCEPerBasicBlock(
+ "x86-seses-one-lfence-per-bb",
+ cl::desc(
+ "Omit all lfences other than the first to be placed in a basic block."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OnlyLFENCENonConst(
+ "x86-seses-only-lfence-non-const",
+ cl::desc("Only lfence before groups of terminators where at least one "
+ "branch instruction has an input to the addressing mode that is a "
+ "register other than %rip."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ OmitBranchLFENCEs("x86-seses-omit-branch-lfences",
+ cl::desc("Omit all lfences before branch instructions."),
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+class X86SpeculativeExecutionSideEffectSuppression
+ : public MachineFunctionPass {
+public:
+ X86SpeculativeExecutionSideEffectSuppression() : MachineFunctionPass(ID) {}
+
+ static char ID;
+ StringRef getPassName() const override {
+ return "X86 Speculative Execution Side Effect Suppression";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // namespace
+
+char X86SpeculativeExecutionSideEffectSuppression::ID = 0;
+
+// This function returns whether the passed instruction uses a memory addressing
+// mode that is constant. We treat all memory addressing modes that read
+// from a register that is not %rip as non-constant. Note that the use
+// of the EFLAGS register results in an addressing mode being considered
+// non-constant, therefore all JCC instructions will return false from this
+// function since one of their operands will always be the EFLAGS register.
+static bool hasConstantAddressingMode(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.uses())
+ if (MO.isReg() && X86::RIP != MO.getReg())
+ return false;
+ return true;
+}
+
+bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction(
+ MachineFunction &MF) {
+
+ const auto &OptLevel = MF.getTarget().getOptLevel();
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check whether SESES needs to run as the fallback for LVI at O0, whether the
+ // user explicitly passed an SESES flag, or whether the SESES target feature
+ // was set.
+ if (!EnableSpeculativeExecutionSideEffectSuppression &&
+ !(Subtarget.useLVILoadHardening() && OptLevel == CodeGenOpt::None) &&
+ !Subtarget.useSpeculativeExecutionSideEffectSuppression())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+ bool Modified = false;
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ MachineInstr *FirstTerminator = nullptr;
+ // Keep track of whether the previous instruction was an LFENCE to avoid
+ // adding redundant LFENCEs.
+ bool PrevInstIsLFENCE = false;
+ for (auto &MI : MBB) {
+
+ if (MI.getOpcode() == X86::LFENCE) {
+ PrevInstIsLFENCE = true;
+ continue;
+ }
+ // We want to put an LFENCE before any instruction that
+ // may load or store. This LFENCE is intended to avoid leaking any secret
+ // data due to a given load or store. This results in closing the cache
+ // and memory timing side channels. We will treat terminators that load
+ // or store separately.
+ if (MI.mayLoadOrStore() && !MI.isTerminator()) {
+ if (!PrevInstIsLFENCE) {
+ BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ if (OneLFENCEPerBasicBlock)
+ break;
+ }
+ // The following section will be LFENCEing before groups of terminators
+ // that include branches. This will close the branch prediction side
+ // channels since we will prevent code executing after misspeculation as
+ // a result of the LFENCEs placed with this logic.
+
+ // Keep track of the first terminator in a basic block since if we need
+ // to LFENCE the terminators in this basic block we must add the
+ // instruction before the first terminator in the basic block (as
+ // opposed to before the terminator that indicates an LFENCE is
+ // required). An example of why this is necessary is that the
+ // X86InstrInfo::analyzeBranch method assumes all terminators are grouped
+ // together and terminates its analysis once the first non-terminator
+ // instruction is found.
+ if (MI.isTerminator() && FirstTerminator == nullptr)
+ FirstTerminator = &MI;
+
+ // Look for branch instructions that will require an LFENCE to be put
+ // before this basic block's terminators.
+ if (!MI.isBranch() || OmitBranchLFENCEs) {
+ // This isn't a branch or we're not putting LFENCEs before branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ if (OnlyLFENCENonConst && hasConstantAddressingMode(MI)) {
+ // This is a branch, but it only has constant addressing mode and we're
+ // not adding LFENCEs before such branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ // This branch requires adding an LFENCE.
+ if (!PrevInstIsLFENCE) {
+ BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ break;
+ }
+ }
+
+ return Modified;
+}
+
+FunctionPass *llvm::createX86SpeculativeExecutionSideEffectSuppression() {
+ return new X86SpeculativeExecutionSideEffectSuppression();
+}
+
+INITIALIZE_PASS(X86SpeculativeExecutionSideEffectSuppression, "x86-seses",
+ "X86 Speculative Execution Side Effect Suppression", false,
+ false)
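
The gate at the top of runOnMachineFunction means this new pass only runs in three situations: the hidden -x86-seses-enable-without-lvi-cfi flag is set, the subtarget's SESES feature is enabled, or LVI load hardening was requested at -O0, where SESES acts as the cheaper fallback. A small sketch of that activation check as a free-standing predicate (parameter names are illustrative; the logic follows the condition in the function above).

// SESES activation check, factored out of runOnMachineFunction for clarity.
static bool shouldRunSESES(bool ForcedByFlag, bool HasSESESFeature,
                           bool UseLVILoadHardening, bool OptLevelIsO0) {
  // Explicitly forced, requested as a target feature, or serving as the
  // -O0 fallback for LVI load hardening.
  return ForcedByFlag || HasSESESFeature ||
         (UseLVILoadHardening && OptLevelIsO0);
}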
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 9aa47c532e82..fe5b9a05f811 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -872,10 +873,10 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
case X86::FARCALL16m:
case X86::FARCALL32m:
- case X86::FARCALL64:
+ case X86::FARCALL64m:
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We cannot mitigate far jumps or calls, but we also don't expect them
// to be vulnerable to Spectre v1.2 style attacks.
continue;
@@ -920,6 +921,11 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
// Now stitch the new instructions into place and erase the old one.
for (auto *NewMI : NewMIs)
MBB.insert(MI.getIterator(), NewMI);
+
+ // Update the call site info.
+ if (MI.isCandidateForCallSiteEntry())
+ MF.eraseCallSiteInfo(&MI);
+
MI.eraseFromParent();
LLVM_DEBUG({
dbgs() << "Unfolded load successfully into:\n";
@@ -993,7 +999,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We cannot mitigate far jumps or calls, but we also don't expect them
// to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
continue;
@@ -1195,394 +1201,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
return CMovs;
}
-/// Returns true if the instruction has no behavior (specified or otherwise)
-/// that is based on the value of any of its register operands
-///
-/// A classical example of something that is inherently not data invariant is an
-/// indirect jump -- the destination is loaded into icache based on the bits set
-/// in the jump destination register.
-///
-/// FIXME: This should become part of our instruction tables.
-static bool isDataInvariant(MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- // By default, assume that the instruction is not data invariant.
- return false;
-
- // Some target-independent operations that trivially lower to data-invariant
- // instructions.
- case TargetOpcode::COPY:
- case TargetOpcode::INSERT_SUBREG:
- case TargetOpcode::SUBREG_TO_REG:
- return true;
-
- // On x86 it is believed that imul is constant time w.r.t. the loaded data.
- // However, they set flags and are perhaps the most surprisingly constant
- // time operations so we call them out here separately.
- case X86::IMUL16rr:
- case X86::IMUL16rri8:
- case X86::IMUL16rri:
- case X86::IMUL32rr:
- case X86::IMUL32rri8:
- case X86::IMUL32rri:
- case X86::IMUL64rr:
- case X86::IMUL64rri32:
- case X86::IMUL64rri8:
-
- // Bit scanning and counting instructions that are somewhat surprisingly
- // constant time as they scan across bits and do other fairly complex
- // operations like popcnt, but are believed to be constant time on x86.
- // However, these set flags.
- case X86::BSF16rr:
- case X86::BSF32rr:
- case X86::BSF64rr:
- case X86::BSR16rr:
- case X86::BSR32rr:
- case X86::BSR64rr:
- case X86::LZCNT16rr:
- case X86::LZCNT32rr:
- case X86::LZCNT64rr:
- case X86::POPCNT16rr:
- case X86::POPCNT32rr:
- case X86::POPCNT64rr:
- case X86::TZCNT16rr:
- case X86::TZCNT32rr:
- case X86::TZCNT64rr:
-
- // Bit manipulation instructions are effectively combinations of basic
- // arithmetic ops, and should still execute in constant time. These also
- // set flags.
- case X86::BLCFILL32rr:
- case X86::BLCFILL64rr:
- case X86::BLCI32rr:
- case X86::BLCI64rr:
- case X86::BLCIC32rr:
- case X86::BLCIC64rr:
- case X86::BLCMSK32rr:
- case X86::BLCMSK64rr:
- case X86::BLCS32rr:
- case X86::BLCS64rr:
- case X86::BLSFILL32rr:
- case X86::BLSFILL64rr:
- case X86::BLSI32rr:
- case X86::BLSI64rr:
- case X86::BLSIC32rr:
- case X86::BLSIC64rr:
- case X86::BLSMSK32rr:
- case X86::BLSMSK64rr:
- case X86::BLSR32rr:
- case X86::BLSR64rr:
- case X86::TZMSK32rr:
- case X86::TZMSK64rr:
-
- // Bit extracting and clearing instructions should execute in constant time,
- // and set flags.
- case X86::BEXTR32rr:
- case X86::BEXTR64rr:
- case X86::BEXTRI32ri:
- case X86::BEXTRI64ri:
- case X86::BZHI32rr:
- case X86::BZHI64rr:
-
- // Shift and rotate.
- case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
- case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
- case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
- case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
- case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
- case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
- case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
- case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
- case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
- case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
- case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
- case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
- case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
- case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
- case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
- case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
- case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
- case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
- case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
-
- // Basic arithmetic is constant time on the input but does set flags.
- case X86::ADC8rr: case X86::ADC8ri:
- case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
- case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
- case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
- case X86::ADD8rr: case X86::ADD8ri:
- case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
- case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
- case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
- case X86::AND8rr: case X86::AND8ri:
- case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
- case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
- case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
- case X86::OR8rr: case X86::OR8ri:
- case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
- case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
- case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
- case X86::SBB8rr: case X86::SBB8ri:
- case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
- case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
- case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
- case X86::SUB8rr: case X86::SUB8ri:
- case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
- case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
- case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
- case X86::XOR8rr: case X86::XOR8ri:
- case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
- case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
- case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
- // Arithmetic with just 32-bit and 64-bit variants and no immediates.
- case X86::ADCX32rr: case X86::ADCX64rr:
- case X86::ADOX32rr: case X86::ADOX64rr:
- case X86::ANDN32rr: case X86::ANDN64rr:
- // Unary arithmetic operations.
- case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
- case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
- case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
- // Check whether the EFLAGS implicit-def is dead. We assume that this will
- // always find the implicit-def because this code should only be reached
- // for instructions that do in fact implicitly def this.
- if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
- // If we would clobber EFLAGS that are used, just bail for now.
- LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
- MI.dump(); dbgs() << "\n");
- return false;
- }
-
- // Otherwise, fallthrough to handle these the same as instructions that
- // don't set EFLAGS.
- LLVM_FALLTHROUGH;
-
- // Unlike other arithmetic, NOT doesn't set EFLAGS.
- case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
-
- // Various move instructions used to zero or sign extend things. Note that we
- // intentionally don't support the _NOREX variants as we can't handle that
- // register constraint anyways.
- case X86::MOVSX16rr8:
- case X86::MOVSX32rr8: case X86::MOVSX32rr16:
- case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
- case X86::MOVZX16rr8:
- case X86::MOVZX32rr8: case X86::MOVZX32rr16:
- case X86::MOVZX64rr8: case X86::MOVZX64rr16:
- case X86::MOV32rr:
-
- // Arithmetic instructions that are both constant time and don't set flags.
- case X86::RORX32ri:
- case X86::RORX64ri:
- case X86::SARX32rr:
- case X86::SARX64rr:
- case X86::SHLX32rr:
- case X86::SHLX64rr:
- case X86::SHRX32rr:
- case X86::SHRX64rr:
-
- // LEA doesn't actually access memory, and its arithmetic is constant time.
- case X86::LEA16r:
- case X86::LEA32r:
- case X86::LEA64_32r:
- case X86::LEA64r:
- return true;
- }
-}
-
-/// Returns true if the instruction has no behavior (specified or otherwise)
-/// that is based on the value loaded from memory or the value of any
-/// non-address register operands.
-///
-/// For example, if the latency of the instruction is dependent on the
-/// particular bits set in any of the registers *or* any of the bits loaded from
-/// memory.
-///
-/// A classical example of something that is inherently not data invariant is an
-/// indirect jump -- the destination is loaded into icache based on the bits set
-/// in the jump destination register.
-///
-/// FIXME: This should become part of our instruction tables.
-static bool isDataInvariantLoad(MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- // By default, assume that the load will immediately leak.
- return false;
-
- // On x86 it is believed that imul is constant time w.r.t. the loaded data.
- // However, they set flags and are perhaps the most surprisingly constant
- // time operations so we call them out here separately.
- case X86::IMUL16rm:
- case X86::IMUL16rmi8:
- case X86::IMUL16rmi:
- case X86::IMUL32rm:
- case X86::IMUL32rmi8:
- case X86::IMUL32rmi:
- case X86::IMUL64rm:
- case X86::IMUL64rmi32:
- case X86::IMUL64rmi8:
-
- // Bit scanning and counting instructions that are somewhat surprisingly
- // constant time as they scan across bits and do other fairly complex
- // operations like popcnt, but are believed to be constant time on x86.
- // However, these set flags.
- case X86::BSF16rm:
- case X86::BSF32rm:
- case X86::BSF64rm:
- case X86::BSR16rm:
- case X86::BSR32rm:
- case X86::BSR64rm:
- case X86::LZCNT16rm:
- case X86::LZCNT32rm:
- case X86::LZCNT64rm:
- case X86::POPCNT16rm:
- case X86::POPCNT32rm:
- case X86::POPCNT64rm:
- case X86::TZCNT16rm:
- case X86::TZCNT32rm:
- case X86::TZCNT64rm:
-
- // Bit manipulation instructions are effectively combinations of basic
- // arithmetic ops, and should still execute in constant time. These also
- // set flags.
- case X86::BLCFILL32rm:
- case X86::BLCFILL64rm:
- case X86::BLCI32rm:
- case X86::BLCI64rm:
- case X86::BLCIC32rm:
- case X86::BLCIC64rm:
- case X86::BLCMSK32rm:
- case X86::BLCMSK64rm:
- case X86::BLCS32rm:
- case X86::BLCS64rm:
- case X86::BLSFILL32rm:
- case X86::BLSFILL64rm:
- case X86::BLSI32rm:
- case X86::BLSI64rm:
- case X86::BLSIC32rm:
- case X86::BLSIC64rm:
- case X86::BLSMSK32rm:
- case X86::BLSMSK64rm:
- case X86::BLSR32rm:
- case X86::BLSR64rm:
- case X86::TZMSK32rm:
- case X86::TZMSK64rm:
-
- // Bit extracting and clearing instructions should execute in constant time,
- // and set flags.
- case X86::BEXTR32rm:
- case X86::BEXTR64rm:
- case X86::BEXTRI32mi:
- case X86::BEXTRI64mi:
- case X86::BZHI32rm:
- case X86::BZHI64rm:
-
- // Basic arithmetic is constant time on the input but does set flags.
- case X86::ADC8rm:
- case X86::ADC16rm:
- case X86::ADC32rm:
- case X86::ADC64rm:
- case X86::ADCX32rm:
- case X86::ADCX64rm:
- case X86::ADD8rm:
- case X86::ADD16rm:
- case X86::ADD32rm:
- case X86::ADD64rm:
- case X86::ADOX32rm:
- case X86::ADOX64rm:
- case X86::AND8rm:
- case X86::AND16rm:
- case X86::AND32rm:
- case X86::AND64rm:
- case X86::ANDN32rm:
- case X86::ANDN64rm:
- case X86::OR8rm:
- case X86::OR16rm:
- case X86::OR32rm:
- case X86::OR64rm:
- case X86::SBB8rm:
- case X86::SBB16rm:
- case X86::SBB32rm:
- case X86::SBB64rm:
- case X86::SUB8rm:
- case X86::SUB16rm:
- case X86::SUB32rm:
- case X86::SUB64rm:
- case X86::XOR8rm:
- case X86::XOR16rm:
- case X86::XOR32rm:
- case X86::XOR64rm:
- // Check whether the EFLAGS implicit-def is dead. We assume that this will
- // always find the implicit-def because this code should only be reached
- // for instructions that do in fact implicitly def this.
- if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
- // If we would clobber EFLAGS that are used, just bail for now.
- LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
- MI.dump(); dbgs() << "\n");
- return false;
- }
-
- // Otherwise, fallthrough to handle these the same as instructions that
- // don't set EFLAGS.
- LLVM_FALLTHROUGH;
-
- // Integer multiply w/o affecting flags is still believed to be constant
- // time on x86. Called out separately as this is among the most surprising
- // instructions to exhibit that behavior.
- case X86::MULX32rm:
- case X86::MULX64rm:
-
- // Arithmetic instructions that are both constant time and don't set flags.
- case X86::RORX32mi:
- case X86::RORX64mi:
- case X86::SARX32rm:
- case X86::SARX64rm:
- case X86::SHLX32rm:
- case X86::SHLX64rm:
- case X86::SHRX32rm:
- case X86::SHRX64rm:
-
- // Conversions are believed to be constant time and don't set flags.
- case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
- case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
- case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
- case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
- case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
- case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
- case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
- case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
- case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
- case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
- // AVX512 added unsigned integer conversions.
- case X86::VCVTTSD2USI64Zrm:
- case X86::VCVTTSD2USIZrm:
- case X86::VCVTTSS2USI64Zrm:
- case X86::VCVTTSS2USIZrm:
- case X86::VCVTUSI2SDZrm:
- case X86::VCVTUSI642SDZrm:
- case X86::VCVTUSI2SSZrm:
- case X86::VCVTUSI642SSZrm:
-
- // Loads to register don't set flags.
- case X86::MOV8rm:
- case X86::MOV8rm_NOREX:
- case X86::MOV16rm:
- case X86::MOV32rm:
- case X86::MOV64rm:
- case X86::MOVSX16rm8:
- case X86::MOVSX32rm16:
- case X86::MOVSX32rm8:
- case X86::MOVSX32rm8_NOREX:
- case X86::MOVSX64rm16:
- case X86::MOVSX64rm32:
- case X86::MOVSX64rm8:
- case X86::MOVZX16rm8:
- case X86::MOVZX32rm16:
- case X86::MOVZX32rm8:
- case X86::MOVZX32rm8_NOREX:
- case X86::MOVZX64rm16:
- case X86::MOVZX64rm8:
- return true;
+// Returns true if MI defines EFLAGS and that definition is live; otherwise
+// returns false.
+static bool isEFLAGSDefLive(const MachineInstr &MI) {
+ if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ return !DefOp->isDead();
}
+ return false;
}
static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -1740,8 +1365,9 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
// address registers, queue it up to be hardened post-load. Notably,
// even once hardened this won't introduce a useful dependency that
// could prune out subsequent loads.
- if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
- MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+ if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
+ !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
+ MI.getOperand(0).isReg() &&
canHardenRegister(MI.getOperand(0).getReg()) &&
!HardenedAddrRegs.count(BaseReg) &&
!HardenedAddrRegs.count(IndexReg)) {
@@ -1795,9 +1421,10 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
if (HardenPostLoad.erase(&MI)) {
assert(!MI.isCall() && "Must not try to post-load harden a call!");
- // If this is a data-invariant load, we want to try and sink any
- // hardening as far as possible.
- if (isDataInvariantLoad(MI)) {
+ // If this is a data-invariant load and there is no EFLAGS
+ // interference, we want to try and sink any hardening as far as
+ // possible.
+ if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
// Sink the instruction we'll need to harden as far as we can down
// the graph.
MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
@@ -2085,9 +1712,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
// Broadcast our state into a vector register.
Register VStateReg = MRI->createVirtualRegister(OpRC);
- unsigned BroadcastOp =
- Is128Bit ? X86::VPBROADCASTQrZ128r
- : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
+ unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
+ : Is256Bit ? X86::VPBROADCASTQrZ256rr
+ : X86::VPBROADCASTQrZrr;
auto BroadcastI =
BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
.addReg(StateReg);
@@ -2147,8 +1774,11 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
- assert(isDataInvariantLoad(InitialMI) &&
+ assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
"Cannot get here with a non-invariant load!");
+ assert(!isEFLAGSDefLive(InitialMI) &&
+ "Cannot get here with a data invariant load "
+ "that interferes with EFLAGS!");
// See if we can sink hardening the loaded value.
auto SinkCheckToSingleUse =
@@ -2160,14 +1790,14 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// own.
MachineInstr *SingleUseMI = nullptr;
for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
- // If we're already going to harden this use, it is data invariant and
- // within our block.
+ // If we're already going to harden this use, it is data invariant, it
+ // does not interfere with EFLAGS, and within our block.
if (HardenedInstrs.count(&UseMI)) {
- if (!isDataInvariantLoad(UseMI)) {
+ if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) {
// If we've already decided to harden a non-load, we must have sunk
// some other post-load hardened instruction to it and it must itself
// be data-invariant.
- assert(isDataInvariant(UseMI) &&
+ assert(X86InstrInfo::isDataInvariant(UseMI) &&
"Data variant instruction being hardened!");
continue;
}
@@ -2199,7 +1829,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// If this single use isn't data invariant, isn't in this block, or has
// interfering EFLAGS, we can't sink the hardening to it.
- if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
+ if (!X86InstrInfo::isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent() ||
+ isEFLAGSDefLive(UseMI))
return {};
// If this instruction defines multiple registers bail as we won't harden
@@ -2590,10 +2221,10 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
switch (MI.getOpcode()) {
case X86::FARCALL16m:
case X86::FARCALL32m:
- case X86::FARCALL64:
+ case X86::FARCALL64m:
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We don't need to harden either far calls or far jumps as they are
// safe from Spectre.
return;
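
With the large local isDataInvariant/isDataInvariantLoad switches deleted in favor of the X86InstrInfo versions, each post-load hardening site in this pass now pairs that query with the new isEFLAGSDefLive() helper, so hardening is only attempted or sunk into instructions whose implicit EFLAGS def is dead. A minimal sketch of the combined guard; the wrapper name is illustrative and assumes X86InstrInfo.h plus the isEFLAGSDefLive() helper added above.

#include "X86InstrInfo.h"

// Post-load hardening is only considered for loads that are data invariant
// and do not leave a live EFLAGS definition behind.
static bool canPostLoadHarden(llvm::MachineInstr &MI) {
  return llvm::X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI);
}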
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
index 75c3a70b430a..975cbabb30fd 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -10,14 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "X86Subtarget.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
-
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86MacroFusion.h"
#include "X86RegisterBankInfo.h"
-#include "X86Subtarget.h"
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -89,7 +88,9 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
// Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
case CodeModel::Medium:
- if (isa<Function>(GV))
+ // Constant pool and jump table handling pass a nullptr to this
+ // function so we need to use isa_and_nonnull.
+ if (isa_and_nonnull<Function>(GV))
return X86II::MO_NO_FLAG; // All code is RIP-relative
return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
}
@@ -227,11 +228,11 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const {
}
void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
- std::string FullFS = FS;
+ std::string FullFS = std::string(FS);
if (In64BitMode) {
// SSE2 should default to enabled in 64-bit mode, but can be turned off
// explicitly.
@@ -379,3 +380,7 @@ void X86Subtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(createX86MacroFusionDAGMutation());
}
+
+bool X86Subtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
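
The isa_and_nonnull change above is needed because classifyLocalReference can now be reached with a null GlobalValue when it is asked about constant-pool or jump-table references. A reduced sketch of just the CodeModel::Medium branch, using the X86II flags from MCTargetDesc/X86BaseInfo.h (the free function is illustrative; in the patch this logic lives inside X86Subtarget::classifyLocalReference).

#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Casting.h"

// Medium code model: code stays RIP-relative, while data references,
// including a null GV coming from constant pools or jump tables, use GOTOFF.
static unsigned char classifyMediumLocalReference(const llvm::GlobalValue *GV) {
  if (llvm::isa_and_nonnull<llvm::Function>(GV))
    return llvm::X86II::MO_NO_FLAG; // All code is RIP-relative.
  return llvm::X86II::MO_GOTOFF;    // Local symbols use GOTOFF.
}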
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
index af5153243c8b..de45d357e3c2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86Subtarget.h
@@ -17,15 +17,9 @@
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetMachine.h"
#include <climits>
#include <memory>
@@ -34,7 +28,13 @@
namespace llvm {
+class CallLowering;
class GlobalValue;
+class InstructionSelector;
+class LegalizerInfo;
+class RegisterBankInfo;
+class StringRef;
+class TargetMachine;
/// The X86 backend supports a number of different styles of PIC.
///
@@ -258,6 +258,10 @@ protected:
bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
+ /// 7 bytes.
+ bool HasFast7ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
bool HasFast11ByteNOP = false;
@@ -393,6 +397,17 @@ protected:
/// Processor supports PCONFIG instruction
bool HasPCONFIG = false;
+ /// Processor supports SERIALIZE instruction
+ bool HasSERIALIZE = false;
+
+ /// Processor supports TSXLDTRK instruction
+ bool HasTSXLDTRK = false;
+
+ /// Processor has AMX support
+ bool HasAMXTILE = false;
+ bool HasAMXBF16 = false;
+ bool HasAMXINT8 = false;
+
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
@@ -427,6 +442,9 @@ protected:
/// POP+LFENCE+JMP sequence.
bool UseLVIControlFlowIntegrity = false;
+ /// Enable Speculative Execution Side Effect Suppression
+ bool UseSpeculativeExecutionSideEffectSuppression = false;
+
/// Insert LFENCE instructions to prevent data speculatively injected into
/// loads from being used maliciously.
bool UseLVILoadHardening = false;
@@ -637,8 +655,15 @@ public:
bool hasRTM() const { return HasRTM; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
- bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
+ bool hasPrefetchW() const {
+ // The PREFETCHW instruction was added with 3DNow but later CPUs gave it
+ // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
+ // it and KNL has another that prefetches to L2 cache. We assume the
+ // L1 version exists if the L2 version does.
+ return has3DNow() || hasPRFCHW() || hasPREFETCHWT1();
+ }
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefix supporting cache
// level OR if we have prfchw, but don't already have a read prefetch from
@@ -712,10 +737,15 @@ public:
bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
bool hasENQCMD() const { return HasENQCMD; }
+ bool hasSERIALIZE() const { return HasSERIALIZE; }
+ bool hasTSXLDTRK() const { return HasTSXLDTRK; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
+ bool hasAMXTILE() const { return HasAMXTILE; }
+ bool hasAMXBF16() const { return HasAMXBF16; }
+ bool hasAMXINT8() const { return HasAMXINT8; }
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
// These are generic getters that OR together all of the thunk types
@@ -732,6 +762,9 @@ public:
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
bool useLVILoadHardening() const { return UseLVILoadHardening; }
+ bool useSpeculativeExecutionSideEffectSuppression() const {
+ return UseSpeculativeExecutionSideEffectSuppression;
+ }
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
@@ -829,7 +862,7 @@ public:
return PICStyle == PICStyles::Style::StubPIC;
}
- bool isPositionIndependent() const { return TM.isPositionIndependent(); }
+ bool isPositionIndependent() const;
bool isCallingConvWin64(CallingConv::ID CC) const {
switch (CC) {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
index 9f639ffa22ec..7344116e14af 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -73,18 +73,22 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
initializeFPSPass(PR);
+ initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
+ initializeX86AvoidTrailingCallPassPass(PR);
initializeX86SpeculativeLoadHardeningPassPass(PR);
+ initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
+ initializeX86PartialReductionPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -94,19 +98,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return std::make_unique<TargetLoweringObjectFileMachO>();
}
- if (TT.isOSFreeBSD())
- return std::make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
- return std::make_unique<X86LinuxNaClTargetObjectFile>();
- if (TT.isOSSolaris())
- return std::make_unique<X86SolarisTargetObjectFile>();
- if (TT.isOSFuchsia())
- return std::make_unique<X86FuchsiaTargetObjectFile>();
- if (TT.isOSBinFormatELF())
- return std::make_unique<X86ELFTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return std::make_unique<TargetLoweringObjectFileCOFF>();
- llvm_unreachable("unknown subtarget type");
+ return std::make_unique<X86ELFTargetObjectFile>();
}
static std::string computeDataLayout(const Triple &TT) {
@@ -234,6 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
setMachineOutliner(true);
+ // x86 supports the debug entry values.
+ setSupportsDebugEntryValues(true);
+
initAsmInfo();
}
@@ -317,14 +314,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
}
//===----------------------------------------------------------------------===//
-// Command line options for x86
-//===----------------------------------------------------------------------===//
-static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
- cl::desc("Minimize AVX to SSE transition penalty"),
- cl::init(true));
-
-//===----------------------------------------------------------------------===//
// X86 TTI query.
//===----------------------------------------------------------------------===//
@@ -408,8 +397,10 @@ void X86PassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedAccessPass());
+ addPass(createX86PartialReductionPass());
+ }
// Add passes that handle indirect branch removal and insertion of a retpoline
// thunk. These will be a no-op unless a function subtarget has the retpoline
@@ -498,10 +489,12 @@ void X86PassConfig::addMachineSSAOptimization() {
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
+ // When -O0 is enabled, the Load Value Injection Hardening pass will fall back
+ // to using the Speculative Execution Side Effect Suppression pass for
+ // mitigation. This is to prevent slowdowns due to
+ // analyses needed by the LVIHardening pass when compiling at -O0.
if (getOptLevel() != CodeGenOpt::None)
addPass(createX86LoadValueInjectionLoadHardeningPass());
- else
- addPass(createX86LoadValueInjectionLoadHardeningUnoptimizedPass());
}
void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
@@ -514,23 +507,33 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86IndirectBranchTrackingPass());
- if (UseVZeroUpper)
- addPass(createX86IssueVZeroUpperPass());
+ addPass(createX86IssueVZeroUpperPass());
if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
- addPass(createX86EvexToVexInsts());
}
+ addPass(createX86EvexToVexInsts());
addPass(createX86DiscriminateMemOpsPass());
addPass(createX86InsertPrefetchPass());
+ addPass(createX86InsertX87waitPass());
}
void X86PassConfig::addPreEmitPass2() {
const Triple &TT = TM->getTargetTriple();
const MCAsmInfo *MAI = TM->getMCAsmInfo();
+ // The X86 Speculative Execution Pass must run after all control
+ // flow graph modifying passes. As a result it was listed to run right before
+ // the X86 Retpoline Thunks pass. The reason it must run after control flow
+ // graph modifications is that the model of LFENCE in LLVM has to be updated
+ // (FIXME: https://bugs.llvm.org/show_bug.cgi?id=45167). Currently the
+ // placement of this pass was hand checked to ensure that the subsequent
+ // passes don't move the code around the LFENCEs in a way that will hurt the
+ // correctness of this pass. This placement has been shown to work based on
+ // hand inspection of the codegen output.
+ addPass(createX86SpeculativeExecutionSideEffectSuppression());
addPass(createX86IndirectThunksPass());
// Insert extra int3 instructions after trailing call instructions to avoid
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
index 757ce8bc5c72..8d98474a39c0 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.h
@@ -23,8 +23,6 @@
namespace llvm {
class StringRef;
-class X86Subtarget;
-class X86RegisterBankInfo;
class TargetTransformInfo;
class X86TargetMachine final : public LLVMTargetMachine {
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index 44185957686b..2b48baccc01f 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace dwarf;
@@ -63,30 +64,3 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
const MCSymbol *Sym) const {
return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
}
-
-void
-X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void
-X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void
-X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
index 1fd0bbf56b19..acea772eb036 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
@@ -44,33 +43,10 @@ namespace llvm {
X86ELFTargetObjectFile() {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
}
-
/// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
- /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD
- /// on x86 and x86-64.
- class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// This implementation is used for Fuchsia on x86-64.
- class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
- /// Native Client on x86 and x86-64.
- class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// This implementation is used for Solaris on x86/x86-64.
- class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
} // end namespace llvm
#endif
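
With the FreeBSD, Fuchsia, Linux/NaCl, and Solaris subclasses removed above, every ELF-style target now shares the plain X86ELFTargetObjectFile, and createTLOF in X86TargetMachine.cpp falls through to it instead of hitting llvm_unreachable for unknown OSes. A condensed sketch of the resulting factory; the real function also special-cases x86-64 Mach-O in its unchanged part, which is elided here.

#include "X86TargetObjectFile.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include <memory>
using namespace llvm;

// Condensed view of the lowering-object-file selection after the change.
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  if (TT.isOSBinFormatMachO())
    return std::make_unique<TargetLoweringObjectFileMachO>();
  if (TT.isOSBinFormatCOFF())
    return std::make_unique<TargetLoweringObjectFileCOFF>();
  return std::make_unique<X86ELFTargetObjectFile>(); // All remaining targets.
}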
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b754836ea517..cc18e55656ef 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -170,12 +170,18 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -256,20 +262,25 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
int Cost =
- 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
if (ISD == ISD::SREM) {
// For SREM: (X % C) is the equivalent of (X - (X/C)*C)
- Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
+ Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
+ Op2Info);
}
return Cost;
@@ -277,12 +288,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Vector unsigned division/remainder will be simplified to shifts/masks.
if (ISD == ISD::UDIV)
- return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
else // UREM
- return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
}
@@ -304,6 +317,10 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -370,6 +387,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
@@ -446,11 +471,32 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
};
if (ST->hasAVX2() &&
@@ -495,18 +541,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
- { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
@@ -533,6 +567,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
@@ -568,6 +603,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRL, MVT::v4i64, 1 },
};
+ if (ST->hasAVX512()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX512, a packed v32i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
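// Illustrative sketch (not part of the upstream change): why a shift left by a
// constant build_vector is priced as a multiply. Per 16-bit lane, X << K
// produces the same value as X * (1 << K), so a constant vector of (1 << K)
// values feeds a single vpmullw.
static unsigned short shlAsMulSketch(unsigned short X, unsigned K) {
  return (unsigned short)(X * (1u << K)); // identical to (unsigned short)(X << K)
}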
+
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -575,7 +622,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -667,13 +715,19 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
static const CostTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
@@ -877,20 +931,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
int ScalarCost = getArithmeticInstrCost(
- Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
}
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
+ int Index, VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
@@ -919,19 +973,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// FIXME: Remove some of the alignment restrictions.
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
// vectors.
- int OrigSubElts = SubTp->getVectorNumElements();
- if (NumSubElts > OrigSubElts &&
- (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+ int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
+ (NumSubElts % OrigSubElts) == 0 &&
LT.second.getVectorElementType() ==
- SubLT.second.getVectorElementType() &&
+ SubLT.second.getVectorElementType() &&
LT.second.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits()) {
assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
"Unexpected number of elements!");
- Type *VecTy = VectorType::get(Tp->getVectorElementType(),
- LT.second.getVectorNumElements());
- Type *SubTy = VectorType::get(Tp->getVectorElementType(),
- SubLT.second.getVectorNumElements());
+ auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+ LT.second.getVectorNumElements());
+ auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+ SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
ExtractIndex, SubTy);
@@ -949,6 +1003,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
}
+ // Handle some common (illegal) sub-vector types as they are often very cheap
+ // to shuffle even on targets without PSHUFB.
+ EVT VT = TLI->getValueType(DL, BaseTp);
+ if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
+ !ST->hasSSSE3()) {
+ static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
+ {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry =
+ CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
+ return Entry->Cost;
+ }
+
// We are going to permute multiple sources and the result will be in multiple
// destinations. Providing an accurate cost only for splits where the element
// type remains the same.
@@ -956,25 +1046,26 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
MVT LegalVT = LT.second;
if (LegalVT.isVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
- LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ cast<FixedVectorType>(BaseTp)->getNumElements()) {
- unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
unsigned NumOfDests = LT.first;
- Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles *
getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
@@ -992,9 +1083,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
- {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
};
if (ST->hasVBMI())
@@ -1006,22 +1097,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
- {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
- {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
+ {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
- {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
- {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
- {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
};
if (ST->hasBWI())
@@ -1034,6 +1121,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
@@ -1065,7 +1154,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
+
+ // FIXME: This just applies the type legalization cost rules above
+ // assuming these completely split.
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
};
if (ST->hasAVX512())
@@ -1267,14 +1363,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
@@ -1283,6 +1387,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
// Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
@@ -1290,42 +1399,45 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
- // Mask zero extend is a load + broadcast.
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
};
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
- { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
@@ -1337,14 +1449,70 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
+
+ // Sign extend is zmm vpternlogd+vptruncdb.
+ // Zero extend is zmm broadcast load+vptruncdw.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
+
+ // Sign extend is zmm vpternlogd+vptruncdw.
+ // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
- // v16i1 -> v16i32 - load + broadcast
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
@@ -1356,6 +1524,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
@@ -1367,44 +1538,163 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
+ };
+
+ static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
+
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
- { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 },
- { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1416,6 +1706,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
@@ -1424,13 +1716,16 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
@@ -1447,6 +1742,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
@@ -1456,15 +1753,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
@@ -1503,8 +1806,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
// This node is expanded into scalarized operations but BasicTTI is overly
// optimistic estimating its cost. It computes 3 per element (one
// vector-extract, one scalar conversion and one vector-insert). The
@@ -1544,7 +1854,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ // These truncates end up widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVZXBQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVZXWQ
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVZXBD
+
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
@@ -1555,6 +1871,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1580,16 +1903,26 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1616,11 +1949,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ // These truncates are really widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
+
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
@@ -1639,7 +1980,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (ST->hasSSE2() && !ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
LTDest.second, LTSrc.second))
- return LTSrc.first * Entry->Cost;
+ return AdjustCost(LTSrc.first * Entry->Cost);
}
EVT SrcTy = TLI->getValueType(DL, Src);
@@ -1647,61 +1988,77 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
MVT SimpleSrcTy = SrcTy.getSimpleVT();
MVT SimpleDstTy = DstTy.getSimpleVT();
- // Make sure that neither type is going to be split before using the
- // AVX512 tables. This handles -mprefer-vector-width=256
- // with -min-legal-vector-width<=256
- if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
- TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
+ if (ST->useAVX512Regs()) {
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasDQI())
if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
if (ST->hasAVX2()) {
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE41()) {
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE2()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1774,6 +2131,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SELECT, MVT::v16i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+
+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
+
+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
};
static const CostTblEntry AVX2CostTbl[] = {
@@ -1878,14 +2241,14 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
+int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
+ const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1935,12 +2298,20 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::BITREVERSE, MVT::v32i16, 10 },
+ { ISD::BITREVERSE, MVT::v64i8, 10 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTLZ, MVT::v32i16, 28 },
+ { ISD::CTLZ, MVT::v64i8, 18 },
{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTPOP, MVT::v32i16, 18 },
+ { ISD::CTPOP, MVT::v64i8, 12 },
{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::CTTZ, MVT::v32i16, 24 },
+ { ISD::CTTZ, MVT::v64i8, 18 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -1949,6 +2320,22 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::FMAXNUM, MVT::f32, 2 },
+ { ISD::FMAXNUM, MVT::v4f32, 2 },
+ { ISD::FMAXNUM, MVT::v8f32, 2 },
+ { ISD::FMAXNUM, MVT::v16f32, 2 },
+ { ISD::FMAXNUM, MVT::f64, 2 },
+ { ISD::FMAXNUM, MVT::v2f64, 2 },
+ { ISD::FMAXNUM, MVT::v4f64, 2 },
+ { ISD::FMAXNUM, MVT::v8f64, 2 },
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -2031,6 +2418,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
+ { ISD::FMAXNUM, MVT::f32, 3 },
+ { ISD::FMAXNUM, MVT::v4f32, 3 },
+ { ISD::FMAXNUM, MVT::v8f32, 5 },
+ { ISD::FMAXNUM, MVT::f64, 3 },
+ { ISD::FMAXNUM, MVT::v2f64, 3 },
+ { ISD::FMAXNUM, MVT::v4f64, 5 },
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -2105,13 +2498,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
+ { ISD::FMAXNUM, MVT::f64, 4 },
+ { ISD::FMAXNUM, MVT::v2f64, 4 },
{ ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::FMAXNUM, MVT::f32, 4 },
+ { ISD::FMAXNUM, MVT::v4f32, 4 },
{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
+ static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
+ { ISD::CTTZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTTZ, MVT::i32, 1 },
+ { ISD::CTTZ, MVT::i16, 1 },
+ { ISD::CTTZ, MVT::i8, 1 },
+ };
static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
{ ISD::CTLZ, MVT::i64, 1 },
};
@@ -2131,6 +2536,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
@@ -2142,6 +2548,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i32, 8 },
{ ISD::CTPOP, MVT::i16, 9 },
{ ISD::CTPOP, MVT::i8, 7 },
@@ -2153,7 +2562,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDO, MVT::i8, 1 },
};
+ Type *RetTy = ICA.getReturnType();
Type *OpTy = RetTy;
+ Intrinsic::ID IID = ICA.getID();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2173,6 +2584,11 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::cttz:
ISD = ISD::CTTZ;
break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ // FMINNUM has the same costs, so don't duplicate.
+ ISD = ISD::FMAXNUM;
+ break;
case Intrinsic::sadd_sat:
ISD = ISD::SADDSAT;
break;
@@ -2256,6 +2672,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasBMI()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
@@ -2284,12 +2709,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF) {
+int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::ROTL, MVT::v8i64, 1 },
{ ISD::ROTL, MVT::v4i64, 1 },
@@ -2340,6 +2770,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSHL, MVT::i8, 4 }
};
+ Intrinsic::ID IID = ICA.getID();
+ Type *RetTy = ICA.getReturnType();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2379,7 +2812,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -2391,10 +2824,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
};
assert(Val->isVectorTy() && "This must be a vector type");
-
Type *ScalarType = Val->getScalarType();
+ int RegisterFileMoveCost = 0;
- if (Index != -1U) {
+ if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -2403,17 +2837,32 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return 0;
// The type may be split. Normalize the index to the new type.
- unsigned Width = LT.second.getVectorNumElements();
- Index = Index % Width;
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned SubNumElts = NumElts;
+ Index = Index % NumElts;
+
+ // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+ // For inserts, we also need to insert the subvector back.
+ if (LT.second.getSizeInBits() > 128) {
+ assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ SubNumElts = NumElts / NumSubVecs;
+ if (SubNumElts <= Index) {
+ RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+ Index %= SubNumElts;
+ }
+ }
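// Illustrative sketch (not part of the upstream change): the subvector
// normalization above, written out as a standalone helper. Example: extracting
// lane 9 of a legal v16i32 (512 bits) gives NumSubVecs = 4 and SubNumElts = 4,
// so one 128-bit extract is charged and the index is renormalized to 9 % 4 == 1.
// Names and parameters are hypothetical.
static void subvectorIndexSketch(unsigned NumElts, unsigned VecBits,
                                 unsigned Index, bool IsInsert,
                                 unsigned &SubIndex, int &MoveCost) {
  unsigned NumSubVecs = VecBits / 128; // number of 128-bit chunks
  unsigned SubNumElts = NumElts / NumSubVecs;
  MoveCost = 0;
  SubIndex = Index;
  if (SubIndex >= SubNumElts) {        // element lives in an upper chunk
    MoveCost = IsInsert ? 2 : 1;       // extract (and reinsert for inserts)
    SubIndex %= SubNumElts;            // index within that chunk
  }
}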
if (Index == 0) {
// Floating point scalars are already located in index #0.
+ // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
+ // true for all.
if (ScalarType->isFloatingPointTy())
- return 0;
+ return RegisterFileMoveCost;
- // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
- if (ScalarType->isIntegerTy())
- return 1;
+ // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+ if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+ return 1 + RegisterFileMoveCost;
}
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2421,24 +2870,124 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
MVT MScalarTy = LT.second.getScalarType();
if (ST->isSLM())
if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost + RegisterFileMoveCost;
+
+ // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()))
+ return 1 + RegisterFileMoveCost;
+
+ // Assume insertps is relatively cheap on all targets.
+ if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+ Opcode == Instruction::InsertElement)
+ return 1 + RegisterFileMoveCost;
+
+ // For extractions we just need to shuffle the element to index 0, which
+ // should be very cheap (assume cost = 1). For insertions we need to shuffle
+ // the elements to their destinations. In both cases we must handle the
+ // subvector move(s).
+ // If the vector type is already less than 128-bits then don't reduce it.
+ // TODO: Under what circumstances should we shuffle using the full width?
+ int ShuffleCost = 1;
+ if (Opcode == Instruction::InsertElement) {
+ auto *SubTy = cast<VectorType>(Val);
+ EVT VT = TLI->getValueType(DL, Val);
+ if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
+ SubTy = FixedVectorType::get(ScalarType, SubNumElts);
+ ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ }
+ int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+ return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
}
// Add to the base cost if we know that the extracted element of a vector is
// destined to be moved to and used in the integer register file.
- int RegisterFileMoveCost = 0;
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
- RegisterFileMoveCost = 1;
+ RegisterFileMoveCost += 1;
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
+unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) {
+ unsigned Cost = 0;
+
+ // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
+ // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
+ if (Insert) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()) ||
+ (MScalarTy == MVT::f32 && ST->hasSSE41())) {
+ // For types we can insert directly, insertion into 128-bit sub vectors is
+ // cheap, followed by a cheap chain of concatenations.
+ if (LT.second.getSizeInBits() <= 128) {
+ Cost +=
+ BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+ } else {
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
+ Cost += DemandedElts.countPopulation();
+
+ // For vXf32 cases, insertion into the 0'th index in each v4f32
+ // 128-bit vector is free.
+ // NOTE: This assumes legalization widens vXf32 vectors.
+ if (MScalarTy == MVT::f32)
+ for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
+ i < e; i += 4)
+ if (DemandedElts[i])
+ Cost--;
+ }
+ } else if (LT.second.isVector()) {
+ // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
+ // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
+ // series of UNPCK followed by CONCAT_VECTORS - all of these can be
+ // considered cheap.
+ if (Ty->isIntOrIntVectorTy())
+ Cost += DemandedElts.countPopulation();
+
+ // Get the smaller of the legalized or original pow2-extended number of
+ // vector elements, which represents the number of unpacks we'll end up
+ // performing.
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned Pow2Elts =
+ PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
+ Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
+ }
+ }
+
+ // TODO: Use default extraction for now, but we should investigate extending this
+ // to handle repeated subvector extraction.
+ if (Extract)
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+
+ return Cost;
+}
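// Illustrative sketch (not part of the upstream change): a worked instance of
// the fast-insert branch above for an AVX target building a fully-demanded
// v8f32 (legal 256-bit type, LT.first == 1). Two 128-bit halves give
// PowerOf2Ceil(2) - 1 == 1 concatenation, the eight demanded lanes add 8, and
// lane 0 of each v4f32 half folds away for f32, so the overhead comes out to 7.
static int v8f32BuildVectorOverheadSketch() {
  int NumSubVecs = 256 / 128;      // two 128-bit subvectors
  int Cost = (NumSubVecs - 1) * 1; // concatenations (NumSubVecs is already pow2)
  Cost += 8;                       // one insert per demanded f32 lane
  Cost -= 2;                       // lanes 0 and 4 are free index-0 inserts
  return Cost;                     // == 7
}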
+
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if (isa_and_nonnull<StoreInst>(I)) {
+ Value *Ptr = I->getOperand(1);
+ // Store instruction with index and scale costs 2 Uops.
+ // Check the preceding GEP to identify non-const indices.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
+ return TTI::TCC_Basic * 2;
+ }
+ }
+ return TTI::TCC_Basic;
+ }
+
// Handle non-power-of-two vectors such as <3 x float>
- if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
- unsigned NumElem = VTy->getVectorNumElements();
+ if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
+ unsigned NumElem = VTy->getNumElements();
// Handle a few common cases:
// <3 x float>
@@ -2453,14 +3002,21 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
- AddressSpace);
- int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+ AddressSpace, CostKind);
+ int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
+ Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
}
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -2478,33 +3034,36 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
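For the non-power-of-two path above, the model is full scalarization; a sketch for a store of <3 x float> (the scalar store cost itself comes from the base implementation, so no absolute number is implied):

//   cost(store <3 x float>) = 3 * cost(scalar float store)
//                           + scalarization overhead of extracting the 3 lanes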
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- unsigned Alignment,
- unsigned AddressSpace) {
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
- VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+ auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
if (!SrcVTy)
// To calculate the scalar cost, take the regular cost without the mask
- return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace);
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
- unsigned NumElem = SrcVTy->getVectorNumElements();
- VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
- (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+ unsigned NumElem = SrcVTy->getNumElements();
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
- int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ int MaskSplitCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
- Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
+ CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-
- int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
+ int ValueSplitCost =
+ getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ Alignment, AddressSpace, CostKind);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
@@ -2519,8 +3078,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
- VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
- LT.second.getVectorNumElements());
+ auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
+ LT.second.getVectorNumElements());
// Expanding requires filling the mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
}
@@ -2558,41 +3117,16 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwise) {
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use it as the cost.
- static const CostTblEntry SLMCostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 3 },
- { ISD::ADD, MVT::v2i64, 5 },
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 2 },
- { ISD::FADD, MVT::v4f32, 4 },
- { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
- { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
- { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
- { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
- { ISD::ADD, MVT::v8i16, 5 },
- { ISD::ADD, MVT::v2i8, 2 },
- { ISD::ADD, MVT::v4i8, 2 },
- { ISD::ADD, MVT::v8i8, 2 },
- { ISD::ADD, MVT::v16i8, 3 },
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- { ISD::FADD, MVT::v4f64, 5 },
- { ISD::FADD, MVT::v8f32, 7 },
- { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
- { ISD::ADD, MVT::v8i32, 5 },
- { ISD::ADD, MVT::v16i16, 6 },
- { ISD::ADD, MVT::v32i8, 4 },
- };
-
static const CostTblEntry SLMCostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 3 },
{ ISD::ADD, MVT::v2i64, 5 },
@@ -2633,66 +3167,49 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
EVT VT = TLI->getValueType(DL, ValTy);
if (VT.isSimple()) {
MVT MTy = VT.getSimpleVT();
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
}
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
// FIXME: These assume a naive kshift+binop lowering, which is probably
// conservative in most cases.
- // FIXME: This doesn't cost large types like v128i1 correctly.
static const CostTblEntry AVX512BoolReduction[] = {
{ ISD::AND, MVT::v2i1, 3 },
{ ISD::AND, MVT::v4i1, 5 },
@@ -2738,252 +3255,408 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
};
// Handle bool allof/anyof patterns.
- if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ValVTy->getElementType()->isIntegerTy(1)) {
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+ }
+
+ unsigned NumVecElts = ValVTy->getNumElements();
+ unsigned ScalarSize = ValVTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+
+ unsigned ReductionCost = 0;
+
+ auto *Ty = ValVTy;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+ ReductionCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
+ }
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
+ ReductionCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
}
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
+ // Add the final extract element to the cost.
+ return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
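To make the halving loop concrete, a hedged trace for an fmul reduction of <8 x float> on an AVX target (FMUL has no entry in the tables above and the type legalizes to one 256-bit register, so the loop runs):

//   8 lanes / 256 bits: extract_subvector to v4f32 + fmul v4f32
//   4 lanes / 128 bits: permute of v2f64           + fmul v4f32
//   2 lanes /  64 bits: shuffle of v4f32           + fmul v4f32
//   plus a final extractelement of lane 0 from the v4f32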
-int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
- bool IsPairwise, bool IsUnsigned) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MTy = LT.second;
int ISD;
- if (ValTy->isIntOrIntVectorTy()) {
+ if (Ty->isIntOrIntVectorTy()) {
ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
} else {
- assert(ValTy->isFPOrFPVectorTy() &&
+ assert(Ty->isFPOrFPVectorTy() &&
"Expected float point or integer vector type.");
ISD = ISD::FMINNUM;
}
- // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
- // and make it as the cost.
+ static const CostTblEntry SSE1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ };
- static const CostTblEntry SSE1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
- };
-
- static const CostTblEntry SSE41CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 2},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,10},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 2},
- {ISD::UMIN, MVT::v8i16, 2},
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
- };
-
- static const CostTblEntry SSE42CostTblPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 2},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 3},
- {ISD::UMIN, MVT::v8i32, 3},
- {ISD::SMIN, MVT::v16i16, 3},
- {ISD::UMIN, MVT::v16i16, 3},
- {ISD::SMIN, MVT::v32i8, 3},
- {ISD::UMIN, MVT::v32i8, 3},
- };
-
- static const CostTblEntry AVX2CostTblPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 2},
- {ISD::UMIN, MVT::v4i64, 2},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX512CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 2},
- {ISD::UMIN, MVT::v8i64, 2},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- static const CostTblEntry SSE1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
+ static const CostTblEntry SSE2CostTbl[] = {
+ {ISD::FMINNUM, MVT::v2f64, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE2CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
+ static const CostTblEntry SSE41CostTbl[] = {
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE41CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 3},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,11},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
+ static const CostTblEntry SSE42CostTbl[] = {
+ {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ static const CostTblEntry AVX1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v8f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::SMIN, MVT::v8i32, 3},
+ {ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
};
- static const CostTblEntry AVX1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 1},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 2},
- {ISD::UMIN, MVT::v8i32, 2},
- {ISD::SMIN, MVT::v16i16, 2},
- {ISD::UMIN, MVT::v16i16, 2},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX2CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 1},
- {ISD::UMIN, MVT::v4i64, 1},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 1},
- {ISD::UMIN, MVT::v32i8, 1},
- };
-
- static const CostTblEntry AVX512CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 1},
- {ISD::UMIN, MVT::v8i64, 1},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- if (IsPairwise) {
- if (ST->hasAVX512())
- if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX2CostTbl[] = {
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 1},
+ {ISD::UMIN, MVT::v32i8, 1},
+ };
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512CostTbl[] = {
+ {ISD::FMINNUM, MVT::v16f32, 1},
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::SMIN, MVT::v2i64, 1},
+ {ISD::UMIN, MVT::v2i64, 1},
+ {ISD::SMIN, MVT::v4i64, 1},
+ {ISD::UMIN, MVT::v4i64, 1},
+ {ISD::SMIN, MVT::v8i64, 1},
+ {ISD::UMIN, MVT::v8i64, 1},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ {ISD::SMIN, MVT::v32i16, 1},
+ {ISD::UMIN, MVT::v32i16, 1},
+ {ISD::SMIN, MVT::v64i8, 1},
+ {ISD::UMIN, MVT::v64i8, 1},
+ };
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ // If we have a native MIN/MAX instruction for this type, use it.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ unsigned CmpOpcode;
+ if (Ty->isFPOrFPVectorTy()) {
+ CmpOpcode = Instruction::FCmp;
} else {
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ assert(Ty->isIntOrIntVectorTy() &&
+ "expecting floating point or integer type for min/max reduction");
+ CmpOpcode = Instruction::ICmp;
+ }
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Otherwise fall back to cmp+select.
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
+}
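When no table entry matches, the helper above degrades to one vector compare plus one vector select per min/max; for example, on an SSE2-only target (no native 64-bit vector min):

//   cost(smin <2 x i64>) = cost(icmp <2 x i64>) + cost(select <2 x i64>)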
+
+int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (ValTy->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(ValTy->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use it as the cost.
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
+ {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
+ {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
+ {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
+ {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v2i8, 3}, // pminsb
+ {ISD::SMIN, MVT::v4i8, 5}, // pminsb
+ {ISD::SMIN, MVT::v8i8, 7}, // pminsb
+ {ISD::SMIN, MVT::v16i8, 6},
+ {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
+ {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
+ {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
+ {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v16i16, 6},
+ {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v32i8, 8},
+ {ISD::UMIN, MVT::v32i8, 8},
+ };
+
+ static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v32i16, 8},
+ {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v64i8, 10},
+ {ISD::UMIN, MVT::v64i8, 10},
+ };
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
-
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
+ }
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+ unsigned NumVecElts = ValVTy->getNumElements();
+
+ auto *Ty = ValVTy;
+ unsigned MinMaxCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 operations.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
+ MTy.getVectorNumElements());
+ MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ MinMaxCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
}
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ unsigned ScalarSize = ValTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(ValVTy->getNumElements()) ||
+ ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ VectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
+ MinMaxCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ auto *SubCondTy =
+ FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
+ MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ }
+
+ // Add the final extract element to the cost.
+ return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
/// Calculate the cost of materializing a 64-bit value. This helper
@@ -2999,7 +3672,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) {
return 2 * TTI::TCC_Basic;
}
-int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3034,7 +3708,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3121,17 +3795,18 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Idx == ImmIdx) {
int NumConstants = divideCeil(BitSize, 64);
- int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3162,52 +3837,45 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned X86TTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
- if (isa<StoreInst>(U)) {
- Value *Ptr = U->getOperand(1);
- // Store instruction with index and scale costs 2 Uops.
- // Check the preceding GEP to identify non-const indices.
- if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
- return TTI::TCC_Basic * 2;
- }
- return TTI::TCC_Basic;
- }
- return BaseT::getUserCost(U, Operands);
+unsigned
+X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
}
// Return an average cost of a Gather / Scatter instruction; may be improved later
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
- unsigned Alignment, unsigned AddressSpace) {
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
// Try to reduce index size from 64 bit (default for GEP)
// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
// operation will use 16 x 64 indices which do not fit in a zmm and needs
// to be split. Also check that the base pointer is the same for all lanes,
// and that there's at most one variable index.
- auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
unsigned IndexSize = DL.getPointerSizeInBits();
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (IndexSize < 64 || !GEP)
return IndexSize;
unsigned NumOfVarIndices = 0;
- Value *Ptrs = GEP->getPointerOperand();
+ const Value *Ptrs = GEP->getPointerOperand();
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
return IndexSize;
for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
if (isa<Constant>(GEP->getOperand(i)))
continue;
Type *IndxTy = GEP->getOperand(i)->getType();
- if (IndxTy->isVectorTy())
- IndxTy = IndxTy->getVectorElementType();
+ if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
+ IndxTy = IndexVTy->getElementType();
if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
!isa<SExtInst>(GEP->getOperand(i))) ||
++NumOfVarIndices > 1)
@@ -3216,21 +3884,21 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
return (unsigned)32;
};
-
// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.
unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
? getIndexSizeInBits(Ptr, DL)
: DL.getPointerSizeInBits();
- Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
- IndexSize), VF);
+ auto *IndexVTy = FixedVectorType::get(
+ IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
- Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ auto *SplitSrcTy =
+ FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
AddressSpace);
}
@@ -3241,7 +3909,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
? ST->getGatherOverhead()
: ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ TTI::TCK_RecipThroughput);
}
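The resulting model is roughly a fixed gather/scatter overhead plus one scalar memory op per lane; e.g. for a <16 x float> gather on an AVX-512 target where both the data and the 32-bit index vector are legal (SplitFactor == 1):

//   cost(gather <16 x float>) = ST->getGatherOverhead()
//                             + 16 * cost(scalar float load)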
/// Return the cost of full scalarization of gather / scatter operation.
@@ -3253,25 +3922,29 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
- bool VariableMask, unsigned Alignment,
+ bool VariableMask, Align Alignment,
unsigned AddressSpace) {
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(VF);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
int MaskUnpackCost = 0;
if (VariableMask) {
- VectorType *MaskTy =
- VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
- MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+ MaskUnpackCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
- nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ nullptr, CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
int InsertExtractCost = 0;
if (Opcode == Instruction::Load)
@@ -3290,21 +3963,28 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
- Value *Ptr, bool VariableMask,
- unsigned Alignment) {
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
+
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
- PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ PtrTy = dyn_cast<PointerType>(
+ cast<VectorType>(Ptr->getType())->getElementType());
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
bool Scalarize = false;
if ((Opcode == Instruction::Load &&
- !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) ||
+ !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
(Opcode == Instruction::Store &&
- !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment))))
+ !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.
@@ -3337,12 +4017,13 @@ bool X86TTIImpl::canMacroFuseCmp() {
return ST->hasMacroFusion() || ST->hasBranchFusion();
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (!ST->hasAVX())
return false;
// The backend can't handle a single element vector.
- if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
+ if (isa<VectorType>(DataTy) &&
+ cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
Type *ScalarTy = DataTy->getScalarType();
@@ -3360,7 +4041,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
return isLegalMaskedLoad(DataType, Alignment);
}
@@ -3407,10 +4088,10 @@ bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
return false;
// The backend can't handle a single element vector.
- if (DataTy->getVectorNumElements() == 1)
+ if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
- Type *ScalarTy = DataTy->getVectorElementType();
+ Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
@@ -3427,7 +4108,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
// enable gather with a -march.
@@ -3446,8 +4127,8 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
// In this case we can reject non-power-of-2 vectors.
// We also reject single element vectors as the type legalizer can't
// scalarize it.
- if (isa<VectorType>(DataTy)) {
- unsigned NumElts = DataTy->getVectorNumElements();
+ if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
+ unsigned NumElts = DataVTy->getNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return false;
}
@@ -3465,7 +4146,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
return IntWidth == 32 || IntWidth == 64;
}
-bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())
return false;
@@ -3505,11 +4186,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
// If we get here, we know the target features match. If one function
// considers 512-bit vectors legal and the other does not, consider them
// incompatible.
- // FIXME Look at the arguments and only consider 512 bit or larger vectors?
const TargetMachine &TM = getTLI()->getTargetMachine();
- return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
- TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+ if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+ return true;
+
+ // Consider the arguments compatible if they aren't vectors or aggregates.
+ // FIXME: Look at the size of vectors.
+ // FIXME: Look at the element types of aggregates to see if there are vectors.
+ // FIXME: The API of this function seems intended to allow arguments
+ // to be removed from the set, but the caller doesn't check if the set
+ // becomes empty so that may not work in practice.
+ return llvm::none_of(Args, [](Argument *A) {
+ auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+ return EltTy->isVectorTy() || EltTy->isAggregateType();
+ });
}
X86TTIImpl::TTI::MemCmpExpansionOptions
@@ -3517,6 +4209,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = 2;
+ // All GPR and vector loads can be unaligned.
+ Options.AllowOverlappingLoads = true;
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
@@ -3524,8 +4218,6 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
- // All GPR and vector loads can be unaligned.
- Options.AllowOverlappingLoads = true;
}
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
@@ -3555,24 +4247,22 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Also support strided loads (interleaved groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3584,10 +4274,11 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// (see MachineValueType.h::getVectorVT()).
if (!LegalVT.isVector())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
- Type *ScalarTy = VecTy->getVectorElementType();
+ unsigned VF = VecTy->getNumElements() / Factor;
+ Type *ScalarTy = VecTy->getElementType();
// Calculate the number of memory operations (NumOfMemOps) required
// to load/store the VecTy.
@@ -3596,16 +4287,18 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- VectorType *VT = VectorType::get(ScalarTy, VF);
+ auto *VT = FixedVectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// TODO: Complete for other data-types and strides.
// Each combination of Stride, ElementTy and VF results in a different
@@ -3664,24 +4357,21 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, CostKind);
}
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
@@ -3696,12 +4386,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
+ unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
if (Opcode == Instruction::Load) {
@@ -3733,8 +4424,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfLoadsInInterleaveGrp =
Indices.size() ? Indices.size() : Factor;
- Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / Factor);
+ auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / Factor);
unsigned NumOfResults =
getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
NumOfLoadsInInterleaveGrp;
@@ -3796,15 +4487,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
return Cost;
}
-int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = cast<VectorType>(VecTy)->getElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
@@ -3813,15 +4501,15 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return false;
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
- return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX512(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
- return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX2(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
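A hedged usage sketch of the hook dispatched above, costing a stride-3 interleaved load of 4 floats (the vectorizer's wide <12 x float> load split into members 0..2); TTI and Ctx are assumed to be in scope, and whether this hits a table entry or the generic fallback depends on the subtarget:

auto *WideTy = FixedVectorType::get(Type::getFloatTy(Ctx), 12);
// Factor = 3, so VF = 12 / 3 = 4 inside the AVX2/AVX-512 implementations.
int Cost = TTI.getInterleavedMemoryOpCost(
    Instruction::Load, WideTy, /*Factor=*/3, /*Indices=*/{0, 1, 2},
    Align(4), /*AddressSpace=*/0, TargetTransformInfo::TCK_RecipThroughput);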
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b9c2dbd78058..d462e1f96ca2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -16,11 +16,9 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
-#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -107,9 +105,9 @@ public:
/// \name Cache TTI Implementation
/// @{
llvm::Optional<unsigned> getCacheSize(
- TargetTransformInfo::CacheLevel Level) const;
+ TargetTransformInfo::CacheLevel Level) const override;
llvm::Optional<unsigned> getCacheAssociativity(
- TargetTransformInfo::CacheLevel Level) const;
+ TargetTransformInfo::CacheLevel Level) const override;
/// @}
/// \name Vector TTI Implementations
@@ -121,76 +119,90 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
- int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
- int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
- bool VariableMask, unsigned Alignment);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getMaskedMemoryOpCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);
unsigned getAtomicMemIntrinsicMaxElementSize() const;
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF = 1);
-
- int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm);
-
- int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
- bool IsUnsigned);
-
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+
+ int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
+
+ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
+
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
int getIntImmCost(int64_t);
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
- bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
- bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment);
+ bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
- bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment);
- bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedGather(Type *DataType, Align Alignment);
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
@@ -203,11 +215,20 @@ public:
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
+
+ /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
+ /// into shuffles and vector math/logic by the backend
+ /// (see TTI::shouldExpandReduction)
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return true;
+ }
+
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
- unsigned Alignment, unsigned AddressSpace);
- int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
- unsigned Alignment, unsigned AddressSpace);
+ Align Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace);
/// @}
};
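
The X86TargetTransformInfo.h hunks above migrate the X86 cost hooks to llvm::Align, IntrinsicCostAttributes, and an explicit TTI::TargetCostKind parameter. As a rough illustration of what the new signatures look like from the caller's side, here is a minimal sketch that queries them through the public TargetTransformInfo wrapper; the helper, its arguments, and the chosen cost kind are assumptions, and only the hook signatures mirror the declarations in the hunk above.

// Illustrative sketch only: querying the reworked cost hooks through the
// public TargetTransformInfo wrapper. The helper, its arguments and the
// chosen cost kind are assumptions; the hook signatures mirror the hunk.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static int exampleVectorCost(const TargetTransformInfo &TTI,
                             VectorType *VecTy, LoadInst *Ld) {
  // Costs are now requested for an explicit kind (throughput, latency,
  // code size, ...) instead of an implicit throughput-only model.
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;

  // Masked memory ops take an llvm::Align instead of a raw unsigned.
  int Masked = TTI.getMaskedMemoryOpCost(Instruction::Load, VecTy,
                                         Ld->getAlign(), /*AddressSpace=*/0,
                                         Kind);

  // Gather/scatter costs now also carry the pointer operand, the cost kind
  // and the originating instruction.
  int Gather = TTI.getGatherScatterOpCost(
      Instruction::Load, VecTy, Ld->getPointerOperand(),
      /*VariableMask=*/true, Ld->getAlign(), Kind, Ld);

  return Masked + Gather;
}
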
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 7a8308ef1ba9..c188c7443625 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -39,6 +39,11 @@ using namespace llvm;
#define DEBUG_TYPE "x86-vzeroupper"
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
namespace {
@@ -278,6 +283,9 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// Loop over all of the basic blocks, inserting vzeroupper instructions before
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ if (!UseVZeroUpper)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (!ST.hasAVX() || !ST.insertVZEROUPPER())
return false;
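
The X86VZeroUpper.cpp hunks add nothing but an escape hatch: a hidden, default-true cl::opt ("x86-use-vzeroupper") that lets the vzeroupper-insertion pass be disabled from the command line, e.g. with something like llc -x86-use-vzeroupper=false. Below is a minimal sketch of the same gating pattern, using a hypothetical flag and pass name rather than the real ones.

// Sketch of the cl::opt gating pattern used above, with a hypothetical flag
// name. The real flag added by the patch is "x86-use-vzeroupper" (hidden,
// default true), so e.g. "llc -x86-use-vzeroupper=false foo.ll" should skip
// the pass.
#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<bool> EnableDemoPass(
    "enable-demo-pass", llvm::cl::Hidden,
    llvm::cl::desc("Run the demo pass (hypothetical flag)"),
    llvm::cl::init(true));

static bool runDemoPass() {
  if (!EnableDemoPass)
    return false; // Report "nothing changed" and bail out early.
  // ... normal pass body would go here ...
  return true;
}
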
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index 42e8fba2201e..72593afb2258 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -19,6 +19,7 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
index 78d3f6460189..8627bbbf18d2 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -19,7 +19,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -67,13 +67,13 @@ private:
Function *generateLSDAInEAXThunk(Function *ParentFunc);
- bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
- void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
- Value *State);
+ bool isStateStoreNeeded(EHPersonality Personality, CallBase &Call);
+ void rewriteSetJmpCall(IRBuilder<> &Builder, Function &F, CallBase &Call,
+ Value *State);
int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
WinEHFuncInfo &FuncInfo, BasicBlock *BB);
- int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
- WinEHFuncInfo &FuncInfo, CallSite CS);
+ int getStateForCall(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, CallBase &Call);
// Module-level type getters.
Type *getEHLinkRegistrationType();
@@ -455,16 +455,14 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
// The idea behind _setjmp3 is that it takes an optional number of personality
// specific parameters to indicate how to restore the personality-specific frame
// state when longjmp is initiated. Typically, the current TryLevel is saved.
-void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
- CallSite CS, Value *State) {
+void WinEHStatePass::rewriteSetJmpCall(IRBuilder<> &Builder, Function &F,
+ CallBase &Call, Value *State) {
// Don't rewrite calls with a weird number of arguments.
- if (CS.getNumArgOperands() != 2)
+ if (Call.getNumArgOperands() != 2)
return;
- Instruction *Inst = CS.getInstruction();
-
SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ Call.getOperandBundlesAsDefs(OpBundles);
SmallVector<Value *, 3> OptionalArgs;
if (Personality == EHPersonality::MSVC_CXX) {
@@ -482,29 +480,27 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
SmallVector<Value *, 5> Args;
Args.push_back(
- Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+ Builder.CreateBitCast(Call.getArgOperand(0), Builder.getInt8PtrTy()));
Args.push_back(Builder.getInt32(OptionalArgs.size()));
Args.append(OptionalArgs.begin(), OptionalArgs.end());
- CallSite NewCS;
- if (CS.isCall()) {
- auto *CI = cast<CallInst>(Inst);
+ CallBase *NewCall;
+ if (auto *CI = dyn_cast<CallInst>(&Call)) {
CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
NewCI->setTailCallKind(CI->getTailCallKind());
- NewCS = NewCI;
+ NewCall = NewCI;
} else {
- auto *II = cast<InvokeInst>(Inst);
- NewCS = Builder.CreateInvoke(
+ auto *II = cast<InvokeInst>(&Call);
+ NewCall = Builder.CreateInvoke(
SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(CS.getAttributes());
- NewCS->setDebugLoc(CS->getDebugLoc());
-
- Instruction *NewInst = NewCS.getInstruction();
- NewInst->takeName(Inst);
- Inst->replaceAllUsesWith(NewInst);
- Inst->eraseFromParent();
+ NewCall->setCallingConv(Call.getCallingConv());
+ NewCall->setAttributes(Call.getAttributes());
+ NewCall->setDebugLoc(Call.getDebugLoc());
+
+ NewCall->takeName(&Call);
+ Call.replaceAllUsesWith(NewCall);
+ Call.eraseFromParent();
}
// Figure out what state we should assign calls in this block.
@@ -527,17 +523,17 @@ int WinEHStatePass::getBaseStateForBB(
}
// Calculate the state a call-site is in.
-int WinEHStatePass::getStateForCallSite(
+int WinEHStatePass::getStateForCall(
DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
- CallSite CS) {
- if (auto *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ CallBase &Call) {
+ if (auto *II = dyn_cast<InvokeInst>(&Call)) {
// Look up the state number of the EH pad this unwinds to.
assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
return FuncInfo.InvokeStateMap[II];
}
// Possibly throwing call instructions have no actions to take after
// an unwind. Ensure they are in the -1 state.
- return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+ return getBaseStateForBB(BlockColors, FuncInfo, Call.getParent());
}
// Calculate the intersection of all the FinalStates for a BasicBlock's
@@ -618,16 +614,13 @@ static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
}
bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
- CallSite CS) {
- if (!CS)
- return false;
-
+ CallBase &Call) {
// If the function touches memory, it needs a state store.
if (isAsynchronousEHPersonality(Personality))
- return !CS.doesNotAccessMemory();
+ return !Call.doesNotAccessMemory();
// If the function throws, it needs a state store.
- return !CS.doesNotThrow();
+ return !Call.doesNotThrow();
}
void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
@@ -672,11 +665,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
if (&F.getEntryBlock() == BB)
InitialState = FinalState = ParentBaseState;
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!isStateStoreNeeded(Personality, CS))
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
continue;
- int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
if (InitialState == OverdefinedState)
InitialState = State;
FinalState = State;
@@ -739,11 +732,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
<< " PrevState=" << PrevState << '\n');
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!isStateStoreNeeded(Personality, CS))
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
continue;
- int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
if (State != PrevState)
insertStateNumberStore(&I, State);
PrevState = State;
@@ -756,35 +749,35 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
insertStateNumberStore(BB->getTerminator(), EndState->second);
}
- SmallVector<CallSite, 1> SetJmp3CallSites;
+ SmallVector<CallBase *, 1> SetJmp3Calls;
for (BasicBlock *BB : RPOT) {
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!CS)
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call)
continue;
- if (CS.getCalledValue()->stripPointerCasts() !=
+ if (Call->getCalledOperand()->stripPointerCasts() !=
SetJmp3.getCallee()->stripPointerCasts())
continue;
- SetJmp3CallSites.push_back(CS);
+ SetJmp3Calls.push_back(Call);
}
}
- for (CallSite CS : SetJmp3CallSites) {
- auto &BBColors = BlockColors[CS->getParent()];
+ for (CallBase *Call : SetJmp3Calls) {
+ auto &BBColors = BlockColors[Call->getParent()];
BasicBlock *FuncletEntryBB = BBColors.front();
bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
- IRBuilder<> Builder(CS.getInstruction());
+ IRBuilder<> Builder(Call);
Value *State;
if (InCleanup) {
Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
RegNode, StateFieldIndex);
State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
} else {
- State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
+ State = Builder.getInt32(getStateForCall(BlockColors, FuncInfo, *Call));
}
- rewriteSetJmpCallSite(Builder, F, CS, State);
+ rewriteSetJmpCall(Builder, F, *Call, State);
}
}
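
The X86WinEHState.cpp hunks above all follow the same LLVM-wide migration away from the removed CallSite wrapper: call sites are now reached by dyn_cast'ing the instruction to CallBase (the common base of CallInst and InvokeInst), and the callee is read through getCalledOperand() instead of getCalledValue(). A small self-contained sketch of that idiom follows; the helper and its counting logic are hypothetical, and only the API names mirror the hunks above.

// Sketch of the CallSite -> CallBase idiom used throughout the hunks above.
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"   // CallBase
#include "llvm/IR/Instructions.h" // CallInst / InvokeInst

using namespace llvm;

// Hypothetical helper: count calls in a block that might need EH bookkeeping.
static unsigned countInterestingCalls(BasicBlock &BB) {
  unsigned N = 0;
  for (Instruction &I : BB) {
    // Replaces the old "CallSite CS(&I); if (!CS) continue;" pattern.
    auto *Call = dyn_cast<CallBase>(&I);
    if (!Call)
      continue;
    // Predicates that CallSite used to forward live directly on CallBase.
    if (!Call->doesNotThrow() || !Call->doesNotAccessMemory())
      ++N;
    // The callee is reached via getCalledOperand() (formerly getCalledValue()).
    if (const Value *Callee = Call->getCalledOperand())
      (void)Callee->stripPointerCasts();
  }
  return N;
}
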