Diffstat (limited to 'lib/Target/X86/X86InstrInfo.cpp')
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 1514
1 file changed, 1063 insertions, 451 deletions
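Much of the patch below extends the register-to-memory folding tables (for example the new PUSH16r/PUSH32r/PUSH64r -> PUSH*rmm entries tagged TB_FOLDED_LOAD, plus ADX and FMA3 *_Int entries). As a rough illustration of what one such entry means — using hypothetical constants and a plain map rather than the real static X86MemoryFoldTableEntry arrays in this file — folding lets the compiler turn "movq (%rdi), %rax ; pushq %rax" into a single "pushq (%rdi)":

    // Minimal sketch, not the in-tree data structure: each fold-table entry
    // pairs a register-form opcode with its memory-form opcode plus flags.
    #include <cstdint>
    #include <unordered_map>

    enum : unsigned { PUSH64r = 1, PUSH64rmm = 2 };  // stand-ins for X86::PUSH64r / X86::PUSH64rmm
    constexpr uint16_t TB_FOLDED_LOAD = 1 << 0;      // stand-in flag value

    struct FoldInfo { unsigned MemOpcode; uint16_t Flags; };

    int main() {
      std::unordered_map<unsigned, FoldInfo> foldTable = {
          {PUSH64r, {PUSH64rmm, TB_FOLDED_LOAD}},
      };
      // A PUSH64r whose source register is defined only by a load can be
      // rewritten as PUSH64rmm, folding the load into the push itself.
      auto it = foldTable.find(PUSH64r);
      return (it != foldTable.end() && (it->second.Flags & TB_FOLDED_LOAD)) ? 0 : 1;
    }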
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index cf68ef053361..63e78de69bc9 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -101,9 +102,11 @@ struct X86MemoryFoldTableEntry { void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo( - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -332,6 +335,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -495,7 +501,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, @@ -605,7 +610,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, @@ -1647,6 +1651,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -1729,11 +1739,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 
TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1749,11 +1765,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1769,11 +1791,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1789,11 +1817,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, @@ -2282,7 +2316,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case 
X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: { + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. if (MI->getOperand(1+X86::AddrBaseReg).isReg() && MI->getOperand(1+X86::AddrScaleAmt).isImm() && @@ -2363,9 +2425,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // It is safe to clobber EFLAGS at the end of a block of no successor has it // live in. if (Iter == E) { - for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), - SE = MBB.succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) return false; return true; } @@ -2411,13 +2472,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - // MOV32r0 is implemented with a xor which clobbers condition code. - // Re-materialize it as movri instructions to avoid side effects. - unsigned Opc = Orig->getOpcode(); - if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + DebugLoc DL = Orig->getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(0); + .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); @@ -2428,7 +2505,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -static bool hasLiveCondCodeDef(MachineInstr *MI) { +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && @@ -2453,7 +2530,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. - // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // A LEA instruction utilizes a SIB byte to encode its scale factor. 
// The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; @@ -2493,7 +2570,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, ImplicitOp = Src; ImplicitOp.setImplicit(); - NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); @@ -2914,10 +2991,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// We have a few instructions that must be hacked on to commute them. -/// -MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function. It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case 
X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case 
X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + *IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2944,7 +3173,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2980,7 +3209,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -2995,7 +3224,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3016,7 +3245,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3045,7 +3274,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3124,11 +3353,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. 
} default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } +} + +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. + if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Both of operands are not fixed. By default set one of commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is useless then. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. 
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3141,46 +3631,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -3821,15 +4287,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -inline static bool MaskRegClassContains(unsigned Reg) { +static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { @@ -3837,21 +4346,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if (MaskRegClassContains(DestReg) && - MaskRegClassContains(SrcReg)) + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && - 
(X86::GR32RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg) || - X86::GR8RegClass.contains(SrcReg))) { - SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); return X86::KMOVWkr; } - if ((X86::GR32RegClass.contains(DestReg) || - X86::GR16RegClass.contains(DestReg) || - X86::GR8RegClass.contains(DestReg)) && - MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); return X86::KMOVWrk; } return 0; @@ -3886,7 +4388,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) @@ -3900,34 +4402,86 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - if (SrcReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF64)); - BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } return; } - if (X86::GR32RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF32)); - BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return; + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. 
Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } - } - if (DestReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF64)); - return; + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); } - if (X86::GR32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF32)); - return; + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -4602,9 +5156,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) return false; } @@ -4645,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) - OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); return true; } @@ -4694,8 +5247,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); - if (FoldMI) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4725,6 +5277,82 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + +bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + int64_t Imm = MIB->getOperand(1).getImm(); + assert(Imm != 0 && "Using push/pop for 0 is not efficient."); + MachineBasicBlock::iterator I = MIB.getInstr(); + + int StackAdjustment; + + if (Subtarget.is64Bit()) { + assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || + MIB->getOpcode() == X86::MOV32ImmSExti8); + // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and + // widen the register if necessary. + StackAdjustment = 8; + BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(get(X86::POP64r)); + MIB->getOperand(0) + .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); + } else { + assert(MIB->getOpcode() == X86::MOV32ImmSExti8); + StackAdjustment = 4; + BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(get(X86::POP32r)); + } + + // Build CFI if necessary. + MachineFunction &MF = *MBB.getParent(); + const X86FrameLowering *TFL = Subtarget.getFrameLowering(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsDwarfCFI = + !IsWin64Prologue && + (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; + if (EmitCFI) { + TFL->BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); + TFL->BuildCFI(MBB, std::next(I), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); + } + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -4735,8 +5363,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4753,6 +5381,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::MOV32ImmSExti8: + case X86::MOV64ImmSExti8: + return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -4777,10 +5412,22 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. + // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. case X86::KSET0B: - case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1B: - case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; @@ -4788,12 +5435,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) { +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); - if (NumAddrOps < 4) // FrameIndex only - addOffset(MIB, 0); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. 
+ assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, @@ -4828,7 +5491,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); @@ -4838,7 +5502,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs); + addOperands(MIB, MOs, PtrOffset); } else { MIB.addOperand(MO); } @@ -4860,6 +5524,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, @@ -4869,10 +5567,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // For CPUs that favor the register form of a call, - // do not fold loads into calls. - if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) return nullptr; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4886,6 +5587,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. 
+ if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. @@ -4963,60 +5670,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute. - return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. return nullptr; } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. 
+ return nullptr; } } @@ -5208,13 +5911,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; + if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = Subtarget.hasAVX(); - unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. @@ -5222,21 +5926,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); - } else - return; - MI->addRegisterKilled(Reg, TRI, true); + MI->addRegisterKilled(Reg, TRI, true); + } } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // Check switch flag - if (NoFusing) return nullptr; + if (NoFusing) + return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5303,6 +6006,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: return false; default: return true; @@ -5318,6 +6027,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: return false; default: return true; @@ -5342,10 +6057,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Check switch flag if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + // Avoid partial register update stalls unless optimizing for size. 
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. @@ -5460,62 +6173,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // Check switch flag - if (NoFusing) return 0; - - if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { - switch (MI->getOpcode()) { - default: return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - return true; - case X86::ADD32ri: - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. - if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return false; - break; - } - } - - if (Ops.size() != 1) - return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. - const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { - if (Opc == X86::MOV32r0) - return true; - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } - - if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) - return true; - return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); -} - bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr*> &NewMIs) const { @@ -5536,9 +6193,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. 
@@ -5582,20 +6240,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, if (FoldedStore) MIB.addReg(Reg, RegState::Define); - for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) - MIB.addOperand(BeforeOps[i]); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); - for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) - MIB.addOperand(AfterOps[i]); - for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { - MachineOperand &MO = ImpOps[i]; - MIB.addReg(MO.getReg(), - getDefRegState(MO.isDef()) | + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | RegState::Implicit | - getKillRegState(MO.isKill()) | - getDeadRegState(MO.isDead()) | - getUndefRegState(MO.isUndef())); + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { @@ -5686,9 +6343,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5729,9 +6388,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -6192,16 +6853,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. 
static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } @@ -6347,230 +7008,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, - const MachineBasicBlock *MBB) { - assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); - const MachineOperand &Op1 = Inst.getOperand(1); - const MachineOperand &Op2 = Inst.getOperand(2); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - // We need virtual register definitions. - MachineInstr *MI1 = nullptr; - MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) - MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) - MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - - // And they need to be in the trace (otherwise, they won't have a depth). - if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) - return true; +bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const { + assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && + "Reassociation needs binary operators"); + + // Integer binary math/logic instructions have a third source operand: + // the EFLAGS register. That operand must be both defined here and never + // used; ie, it must be dead. If the EFLAGS operand is live, then we can + // not change anything because rearranging the operands could affect other + // instructions that depend on the exact status flags (zero, sign, etc.) + // that are set by using these particular operands with this operation. + if (Inst.getNumOperands() == 4) { + assert(Inst.getOperand(3).isReg() && + Inst.getOperand(3).getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + if (!Inst.getOperand(3).isDead()) + return false; + } - return false; -} - -static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { - const MachineBasicBlock *MBB = Inst.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); - MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); - unsigned AssocOpcode = Inst.getOpcode(); - - // If only one operand has the same opcode and it's the second source operand, - // the operands must be commuted. - Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; - if (Commuted) - std::swap(MI1, MI2); - - // 1. The previous instruction must be the same type as Inst. - // 2. The previous instruction must have virtual register definitions for its - // operands in the same basic block as Inst. - // 3. The previous instruction's result must only be used by Inst. 
- if (MI1->getOpcode() == AssocOpcode && - hasVirtualRegDefsInBasicBlock(*MI1, MBB) && - MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) - return true; - - return false; + return TargetInstrInfo::hasReassociableOperands(Inst, MBB); } // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) -// 2. Other math / logic operations (and, or) -static bool isAssociativeAndCommutative(unsigned Opcode) { - switch (Opcode) { +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::AND8rr: + case X86::AND16rr: + case X86::AND32rr: + case X86::AND64rr: + case X86::OR8rr: + case X86::OR16rr: + case X86::OR32rr: + case X86::OR64rr: + case X86::XOR8rr: + case X86::XOR16rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::IMUL16rr: + case X86::IMUL32rr: + case X86::IMUL64rr: + case X86::PANDrr: + case X86::PORrr: + case X86::PXORrr: + case X86::VPANDrr: + case X86::VPANDYrr: + case X86::VPORrr: + case X86::VPORYrr: + case X86::VPXORrr: + case X86::VPXORYrr: + // Normal min/max instructions are not commutative because of NaN and signed + // zero semantics, but these are. Thus, there's no need to check for global + // relaxed math; the instructions themselves have the properties we need. + case X86::MAXCPDrr: + case X86::MAXCPSrr: + case X86::MAXCSDrr: + case X86::MAXCSSrr: + case X86::MINCPDrr: + case X86::MINCPSrr: + case X86::MINCSDrr: + case X86::MINCSSrr: + case X86::VMAXCPDrr: + case X86::VMAXCPSrr: + case X86::VMAXCPDYrr: + case X86::VMAXCPSYrr: + case X86::VMAXCSDrr: + case X86::VMAXCSSrr: + case X86::VMINCPDrr: + case X86::VMINCPSrr: + case X86::VMINCPDYrr: + case X86::VMINCPSYrr: + case X86::VMINCSDrr: + case X86::VMINCSSrr: + return true; + case X86::ADDPDrr: + case X86::ADDPSrr: case X86::ADDSDrr: case X86::ADDSSrr: - case X86::VADDSDrr: - case X86::VADDSSrr: + case X86::MULPDrr: + case X86::MULPSrr: case X86::MULSDrr: case X86::MULSSrr: + case X86::VADDPDrr: + case X86::VADDPSrr: + case X86::VADDPDYrr: + case X86::VADDPSYrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::VMULPDrr: + case X86::VMULPSrr: + case X86::VMULPDYrr: + case X86::VMULPSYrr: case X86::VMULSDrr: case X86::VMULSSrr: - return true; + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; default: return false; } } -/// Return true if the input instruction is part of a chain of dependent ops -/// that are suitable for reassociation, otherwise return false. -/// If the instruction's operands must be commuted to have a previous -/// instruction of the same type define the first source operand, Commuted will -/// be set to true. -static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { - // 1. The operation must be associative and commutative. - // 2. The instruction must have virtual register definitions for its - // operands in the same basic block. - // 3. The instruction must have a reassociable sibling. - if (isAssociativeAndCommutative(Inst.getOpcode()) && - hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && - hasReassocSibling(Inst, Commuted)) - return true; - - return false; -} - -// FIXME: This has the potential to be expensive (compile time) while not -// improving the code at all. Some ways to limit the overhead: -// 1. Track successful transforms; bail out if hit rate gets too low. -// 2. 
Only enable at -O3 or some other non-default optimization level. -// 3. Pre-screen pattern candidates here: if an operand of the previous -// instruction is known to not increase the critical path, then don't match -// that pattern. -bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { - if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) - return false; - - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - - // Look for this reassociation pattern: - // B = A op X (Prev) - // C = B op Y (Root) - - bool Commute; - if (isReassocCandidate(Root, Commute)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. Specify each commutation - // possibility for the Prev instruction in the sequence and let the - // machine combiner decide if changing the operands is worthwhile. - if (Commute) { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); - } else { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); - } - return true; - } +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. + if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; - return false; + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. 
+ NewOp1.setIsDead(); + NewOp2.setIsDead(); } -/// Attempt the following reassociation to reduce critical path length: -/// B = A op X (Prev) -/// C = B op Y (Root) -/// ===> -/// B = X op Y -/// C = A op B -static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { - MachineFunction *MF = Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); - - // This array encodes the operand index for each parameter because the - // operands may be commuted. Each row corresponds to a pattern value, - // and each column specifies the index of A, B, X, Y. - unsigned OpIdx[4][4] = { - { 1, 1, 2, 2 }, - { 1, 2, 2, 1 }, - { 2, 1, 1, 2 }, - { 2, 2, 1, 1 } - }; - - MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); - MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); - MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); - MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); - MachineOperand &OpC = Root.getOperand(0); - - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(RegA)) - MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) - MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) - MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) - MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) - MRI.constrainRegClass(RegC, RC); - - // Create a new virtual register for the result of (X op Y) instead of - // recycling RegB because the MachineCombiner's computation of the critical - // path requires a new register definition rather than an existing one. - unsigned NewVR = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - - unsigned Opcode = Root.getOpcode(); - bool KillA = OpA.isKill(); - bool KillX = OpX.isKill(); - bool KillY = OpY.isKill(); - - // Create new instructions for insertion. - MachineInstrBuilder MIB1 = - BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) - .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); - InsInstrs.push_back(MIB1); - - MachineInstrBuilder MIB2 = - BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) - .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); - InsInstrs.push_back(MIB2); - - // Record old instructions for deletion. - DelInstrs.push_back(&Prev); - DelInstrs.push_back(&Root); +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); } -void X86InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { - MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); - - // Select the previous instruction in the sequence based on the input pattern. 
- MachineInstr *Prev = nullptr; - switch (Pattern) { - case MachineCombinerPattern::MC_REASSOC_AX_BY: - case MachineCombinerPattern::MC_REASSOC_XA_BY: - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - break; - case MachineCombinerPattern::MC_REASSOC_AX_YB: - case MachineCombinerPattern::MC_REASSOC_XA_YB: - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - } - assert(Prev && "Unknown pattern for machine combiner"); - - reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); - return; +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); } namespace { |
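// ---------------------------------------------------------------------------
// Illustrative addendum, not part of the patch above: a minimal standalone
// sketch of the imm8 rewrite that the new foldMemoryOperandCustom hunk
// performs when folding a load into (V)INSERTPSrr. The helper name and the
// main() driver are hypothetical; only the bit layout and the PtrOffset /
// NewImm computation mirror the patched code.
#include <cassert>
#include <cstdio>

struct InsertPSFold {
  int PtrOffset;   // byte displacement added to the folded load address
  unsigned NewImm; // immediate used by the (V)INSERTPSrm form
};

static InsertPSFold rewriteInsertPSImm(unsigned Imm) {
  unsigned ZMask = Imm & 15;         // bits [3:0]: zero mask
  unsigned DstIdx = (Imm >> 4) & 3;  // bits [5:4]: destination element
  unsigned SrcIdx = (Imm >> 6) & 3;  // bits [7:6]: source element
  // Once the load is folded, the scalar is read directly from element SrcIdx
  // of the memory operand, so the address is displaced by SrcIdx * 4 bytes
  // (one float) and the source index in the immediate collapses to 0.
  return {static_cast<int>(SrcIdx * 4), (DstIdx << 4) | ZMask};
}

int main() {
  InsertPSFold F = rewriteInsertPSImm(0xA1); // SrcIdx=2, DstIdx=2, ZMask=1
  assert(F.PtrOffset == 8 && F.NewImm == 0x21);
  std::printf("disp=%d imm=0x%02X\n", F.PtrOffset, F.NewImm);
  return 0;
}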