diff options
Diffstat (limited to 'lib/Target/X86/X86VZeroUpper.cpp')
-rw-r--r-- | lib/Target/X86/X86VZeroUpper.cpp | 234 |
1 files changed, 207 insertions, 27 deletions
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 39584942468d..2fd78a7231c6 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -14,14 +14,16 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-codegen" +#define DEBUG_TYPE "x86-vzeroupper" #include "X86.h" #include "X86InstrInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/GlobalValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -41,6 +43,60 @@ namespace { private: const TargetInstrInfo *TII; // Machine instruction info. MachineBasicBlock *MBB; // Current basic block + + // Any YMM register live-in to this function? + bool FnHasLiveInYmm; + + // BBState - Contains the state of each MBB: unknown, clean, dirty + SmallVector<uint8_t, 8> BBState; + + // BBSolved - Keep track of all MBB which had been already analyzed + // and there is no further processing required. + BitVector BBSolved; + + // Machine Basic Blocks are classified according this pass: + // + // ST_UNKNOWN - The MBB state is unknown, meaning from the entry state + // until the MBB exit there isn't a instruction using YMM to change + // the state to dirty, or one of the incoming predecessors is unknown + // and there's not a dirty predecessor between them. + // + // ST_CLEAN - No YMM usage in the end of the MBB. A MBB could have + // instructions using YMM and be marked ST_CLEAN, as long as the state + // is cleaned by a vzeroupper before any call. + // + // ST_DIRTY - Any MBB ending with a YMM usage not cleaned up by a + // vzeroupper instruction. + // + // ST_INIT - Placeholder for an empty state set + // + enum { + ST_UNKNOWN = 0, + ST_CLEAN = 1, + ST_DIRTY = 2, + ST_INIT = 3 + }; + + // computeState - Given two states, compute the resulting state, in + // the following way + // + // 1) One dirty state yields another dirty state + // 2) All states must be clean for the result to be clean + // 3) If none above and one unknown, the result state is also unknown + // + unsigned computeState(unsigned PrevState, unsigned CurState) { + if (PrevState == ST_INIT) + return CurState; + + if (PrevState == ST_DIRTY || CurState == ST_DIRTY) + return ST_DIRTY; + + if (PrevState == ST_CLEAN && CurState == ST_CLEAN) + return ST_CLEAN; + + return ST_UNKNOWN; + } + }; char VZeroUpperInserter::ID = 0; } @@ -49,37 +105,82 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); } +static bool isYmmReg(unsigned Reg) { + if (Reg >= X86::YMM0 && Reg <= X86::YMM15) + return true; + + return false; +} + +static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { + for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), + E = MRI.livein_end(); I != E; ++I) + if (isYmmReg(I->first)) + return true; + + return false; +} + +static bool hasYmmReg(MachineInstr *MI) { + for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.isDebug()) + continue; + if (isYmmReg(MO.getReg())) + return true; + } + return false; +} + /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { TII = MF.getTarget().getInstrInfo(); - bool Changed = false; - - // Process any unreachable blocks in arbitrary order now. - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - Changed |= processBasicBlock(MF, *BB); + MachineRegisterInfo &MRI = MF.getRegInfo(); + bool EverMadeChange = false; - return Changed; -} + // Fast check: if the function doesn't use any ymm registers, we don't need + // to insert any VZEROUPPER instructions. This is constant-time, so it is + // cheap in the common case of no ymm use. + bool YMMUsed = false; + const TargetRegisterClass *RC = X86::VR256RegisterClass; + for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); + i != e; i++) { + if (MRI.isPhysRegUsed(*i)) { + YMMUsed = true; + break; + } + } + if (!YMMUsed) + return EverMadeChange; -static bool isCallToModuleFn(const MachineInstr *MI) { - assert(MI->getDesc().isCall() && "Isn't a call instruction"); + // Pre-compute the existence of any live-in YMM registers to this function + FnHasLiveInYmm = checkFnHasLiveInYmm(MRI); - for (int i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + assert(BBState.empty()); + BBState.resize(MF.getNumBlockIDs(), 0); + BBSolved.resize(MF.getNumBlockIDs(), 0); - if (!MO.isGlobal()) - continue; + // Each BB state depends on all predecessors, loop over until everything + // converges. (Once we converge, we can implicitly mark everything that is + // still ST_UNKNOWN as ST_CLEAN.) + while (1) { + bool MadeChange = false; - const GlobalValue *GV = MO.getGlobal(); - GlobalValue::LinkageTypes LT = GV->getLinkage(); - if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) || - (GV->isExternalLinkage(LT) && !GV->isDeclaration())) - return true; + // Process all basic blocks. + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) + MadeChange |= processBasicBlock(MF, *I); - return false; + // If this iteration over the code changed anything, keep iterating. + if (!MadeChange) break; + EverMadeChange = true; } - return false; + + BBState.clear(); + BBSolved.clear(); + return EverMadeChange; } /// processBasicBlock - Loop over all of the instructions in the basic block, @@ -87,19 +188,98 @@ static bool isCallToModuleFn(const MachineInstr *MI) { bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { bool Changed = false; + unsigned BBNum = BB.getNumber(); MBB = &BB; + // Don't process already solved BBs + if (BBSolved[BBNum]) + return false; // No changes + + // Check the state of all predecessors + unsigned EntryState = ST_INIT; + for (MachineBasicBlock::const_pred_iterator PI = BB.pred_begin(), + PE = BB.pred_end(); PI != PE; ++PI) { + EntryState = computeState(EntryState, BBState[(*PI)->getNumber()]); + if (EntryState == ST_DIRTY) + break; + } + + + // The entry MBB for the function may set the inital state to dirty if + // the function receives any YMM incoming arguments + if (MBB == MF.begin()) { + EntryState = ST_CLEAN; + if (FnHasLiveInYmm) + EntryState = ST_DIRTY; + } + + // The current state is initialized according to the predecessors + unsigned CurState = EntryState; + bool BBHasCall = false; + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { MachineInstr *MI = I; DebugLoc dl = I->getDebugLoc(); + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. + if (!isControlFlow && CurState == ST_DIRTY) + continue; + + if (hasYmmReg(MI)) { + // We found a ymm-using instruction; this could be an AVX instruction, + // or it could be control flow. + CurState = ST_DIRTY; + continue; + } - // Insert a vzeroupper instruction before each control transfer - // to functions outside this module - if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) { - BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); - ++NumVZU; + // Check for control-flow out of the current function (which might + // indirectly execute SSE instructions). + if (!isControlFlow) + continue; + + BBHasCall = true; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX + // registers. This instruction has zero latency. In addition, the processor + // changes back to Clean state, after which execution of Intel SSE + // instructions or Intel AVX instructions has no transition penalty. Add + // the VZEROUPPER instruction before any function call/return that might + // execute SSE code. + // FIXME: In some cases, we may want to move the VZEROUPPER into a + // predecessor block. + if (CurState == ST_DIRTY) { + // Only insert the VZEROUPPER in case the entry state isn't unknown. + // When unknown, only compute the information within the block to have + // it available in the exit if possible, but don't change the block. + if (EntryState != ST_UNKNOWN) { + BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER)); + ++NumVZU; + } + + // After the inserted VZEROUPPER the state becomes clean again, but + // other YMM may appear before other subsequent calls or even before + // the end of the BB. + CurState = ST_CLEAN; } } + DEBUG(dbgs() << "MBB #" << BBNum + << ", current state: " << CurState << '\n'); + + // A BB can only be considered solved when we both have done all the + // necessary transformations, and have computed the exit state. This happens + // in two cases: + // 1) We know the entry state: this immediately implies the exit state and + // all the necessary transformations. + // 2) There are no calls, and and a non-call instruction marks this block: + // no transformations are necessary, and we know the exit state. + if (EntryState != ST_UNKNOWN || (!BBHasCall && CurState != ST_UNKNOWN)) + BBSolved[BBNum] = true; + + if (CurState != BBState[BBNum]) + Changed = true; + + BBState[BBNum] = CurState; return Changed; } |