diff options
Diffstat (limited to 'lib/Transforms/Utils/SimplifyCFG.cpp')
-rw-r--r-- | lib/Transforms/Utils/SimplifyCFG.cpp | 203 |
1 files changed, 189 insertions, 14 deletions
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index f02f80cc1b78..b3c80424c8b9 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -127,6 +127,16 @@ static cl::opt<unsigned> MaxSpeculationDepth( cl::desc("Limit maximum recursion depth when calculating costs of " "speculatively executed instructions")); +static cl::opt<unsigned> DependenceChainLatency( + "dependence-chain-latency", cl::Hidden, cl::init(8), + cl::desc("Limit the maximum latency of dependence chain containing cmp " + "for if conversion")); + +static cl::opt<unsigned> SmallBBSize( + "small-bb-size", cl::Hidden, cl::init(40), + cl::desc("Check dependence chain latency only in basic block smaller than " + "this number")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLinearMaps, "Number of switch instructions turned into linear mapping"); @@ -395,6 +405,166 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, return true; } +/// Estimate the code size of the specified BB. +static unsigned CountBBCodeSize(BasicBlock *BB, + const TargetTransformInfo &TTI) { + unsigned Size = 0; + for (auto II = BB->begin(); !isa<TerminatorInst>(II); ++II) + Size += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_CodeSize); + return Size; +} + +/// Find out the latency of the longest dependence chain in the BB if +/// LongestChain is true, or the dependence chain containing the compare +/// instruction feeding the block's conditional branch. +static unsigned FindDependenceChainLatency(BasicBlock *BB, + DenseMap<Instruction *, unsigned> &Instructions, + const TargetTransformInfo &TTI, + bool LongestChain) { + unsigned MaxLatency = 0; + + BasicBlock::iterator II; + for (II = BB->begin(); !isa<TerminatorInst>(II); ++II) { + unsigned Latency = 0; + for (unsigned O = 0, E = II->getNumOperands(); O != E; ++O) { + Instruction *Op = dyn_cast<Instruction>(II->getOperand(O)); + if (Op && Instructions.count(Op)) { + auto OpLatency = Instructions[Op]; + if (OpLatency > Latency) + Latency = OpLatency; + } + } + Latency += TTI.getInstructionCost(&(*II), TargetTransformInfo::TCK_Latency); + Instructions[&(*II)] = Latency; + + if (Latency > MaxLatency) + MaxLatency = Latency; + } + + if (LongestChain) + return MaxLatency; + + // The length of the dependence chain containing the compare instruction is + // wanted, so the terminator must be a BranchInst. + assert(isa<BranchInst>(II)); + BranchInst* Br = cast<BranchInst>(II); + Instruction *Cmp = dyn_cast<Instruction>(Br->getCondition()); + if (Cmp && Instructions.count(Cmp)) + return Instructions[Cmp]; + else + return 0; +} + +/// Instructions in BB2 may depend on instructions in BB1, and instructions +/// in BB1 may have users in BB2. If the last (in terms of latency) such kind +/// of instruction in BB1 is I, then the instructions after I can be executed +/// in parallel with instructions in BB2. +/// This function returns the latency of I. +static unsigned LatencyAdjustment(BasicBlock *BB1, BasicBlock *BB2, + BasicBlock *IfBlock1, BasicBlock *IfBlock2, + DenseMap<Instruction *, unsigned> &BB1Instructions) { + unsigned LastLatency = 0; + SmallVector<Instruction *, 16> Worklist; + BasicBlock::iterator II; + for (II = BB2->begin(); !isa<TerminatorInst>(II); ++II) { + if (PHINode *PN = dyn_cast<PHINode>(II)) { + // Look for users in BB2. + bool InBBUser = false; + for (User *U : PN->users()) { + if (cast<Instruction>(U)->getParent() == BB2) { + InBBUser = true; + break; + } + } + // No such user, we don't care about this instruction and its operands. + if (!InBBUser) + break; + } + Worklist.push_back(&(*II)); + } + + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (unsigned O = 0, E = I->getNumOperands(); O != E; ++O) { + if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(O))) { + if (Op->getParent() == IfBlock1 || Op->getParent() == IfBlock2) + Worklist.push_back(Op); + else if (Op->getParent() == BB1 && BB1Instructions.count(Op)) { + if (BB1Instructions[Op] > LastLatency) + LastLatency = BB1Instructions[Op]; + } + } + } + } + + return LastLatency; +} + +/// If after if conversion, most of the instructions in this new BB construct a +/// long and slow dependence chain, it may be slower than cmp/branch, even +/// if the branch has a high miss rate, because the control dependence is +/// transformed into data dependence, and control dependence can be speculated, +/// and thus, the second part can execute in parallel with the first part on +/// modern OOO processor. +/// +/// To check this condition, this function finds the length of the dependence +/// chain in BB1 (only the part that can be executed in parallel with code after +/// branch in BB2) containing cmp, and if the length is longer than a threshold, +/// don't perform if conversion. +/// +/// BB1, BB2, IfBlock1 and IfBlock2 are candidate BBs for if conversion. +/// SpeculationSize contains the code size of IfBlock1 and IfBlock2. +static bool FindLongDependenceChain(BasicBlock *BB1, BasicBlock *BB2, + BasicBlock *IfBlock1, BasicBlock *IfBlock2, + unsigned SpeculationSize, + const TargetTransformInfo &TTI) { + // Accumulated latency of each instruction in their BBs. + DenseMap<Instruction *, unsigned> BB1Instructions; + DenseMap<Instruction *, unsigned> BB2Instructions; + + if (!TTI.isOutOfOrder()) + return false; + + unsigned NewBBSize = CountBBCodeSize(BB1, TTI) + CountBBCodeSize(BB2, TTI) + + SpeculationSize; + + // We check small BB only since it is more difficult to find unrelated + // instructions to fill functional units in a small BB. + if (NewBBSize > SmallBBSize) + return false; + + auto BB1Chain = + FindDependenceChainLatency(BB1, BB1Instructions, TTI, false); + auto BB2Chain = + FindDependenceChainLatency(BB2, BB2Instructions, TTI, true); + + // If there are many unrelated instructions in the new BB, there will be + // other instructions for the processor to issue regardless of the length + // of this new dependence chain. + // Modern processors can issue 3 or more instructions in each cycle. But in + // real world applications, an IPC of 2 is already very good for non-loop + // code with small basic blocks. Higher IPC is usually found in programs with + // small kernel. So IPC of 2 is more reasonable for most applications. + if ((BB1Chain + BB2Chain) * 2 <= NewBBSize) + return false; + + // We only care about part of the dependence chain in BB1 that can be + // executed in parallel with BB2, so adjust the latency. + BB1Chain -= + LatencyAdjustment(BB1, BB2, IfBlock1, IfBlock2, BB1Instructions); + + // Correctly predicted branch instruction can skip the dependence chain in + // BB1, but misprediction has a penalty, so only when the dependence chain is + // longer than DependenceChainLatency, then branch is better than select. + // Besides misprediction penalty, the threshold value DependenceChainLatency + // also depends on branch misprediction rate, taken branch latency and cmov + // latency. + if (BB1Chain >= DependenceChainLatency) + return true; + + return false; +} + /// Extract ConstantInt from value, looking through IntToPtr /// and PointerNullValue. Return NULL if value is not a constant int. static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) { @@ -1654,14 +1824,11 @@ namespace { } // end anonymous namespace -/// Given an unconditional branch that goes to BBEnd, -/// check whether BBEnd has only two predecessors and the other predecessor -/// ends with an unconditional branch. If it is true, sink any common code -/// in the two predecessors to BBEnd. -static bool SinkThenElseCodeToEnd(BranchInst *BI1) { - assert(BI1->isUnconditional()); - BasicBlock *BBEnd = BI1->getSuccessor(0); - +/// Check whether BB's predecessors end with unconditional branches. If it is +/// true, sink any common code from the predecessors to BB. +/// We also allow one predecessor to end with conditional branch (but no more +/// than one). +static bool SinkCommonCodeFromPredecessors(BasicBlock *BB) { // We support two situations: // (1) all incoming arcs are unconditional // (2) one incoming arc is conditional @@ -1705,7 +1872,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { // SmallVector<BasicBlock*,4> UnconditionalPreds; Instruction *Cond = nullptr; - for (auto *B : predecessors(BBEnd)) { + for (auto *B : predecessors(BB)) { auto *T = B->getTerminator(); if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional()) UnconditionalPreds.push_back(B); @@ -1773,8 +1940,7 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { DEBUG(dbgs() << "SINK: Splitting edge\n"); // We have a conditional edge and we're going to sink some instructions. // Insert a new block postdominating all blocks we're going to sink from. - if (!SplitBlockPredecessors(BI1->getSuccessor(0), UnconditionalPreds, - ".sink.split")) + if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split")) // Edges couldn't be split. return false; Changed = true; @@ -2048,6 +2214,11 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue)) return false; + // Don't do if conversion for long dependence chain. + if (FindLongDependenceChain(BB, EndBB, ThenBB, nullptr, + CountBBCodeSize(ThenBB, TTI), TTI)) + return false; + // If we get here, we can hoist the instruction and if-convert. DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); @@ -2355,6 +2526,10 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } } + if (FindLongDependenceChain(DomBlock, BB, IfBlock1, IfBlock2, + AggressiveInsts.size(), TTI)) + return false; + DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: " << IfTrue->getName() << " F: " << IfFalse->getName() << "\n"); @@ -5728,9 +5903,6 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, BasicBlock *BB = BI->getParent(); BasicBlock *Succ = BI->getSuccessor(0); - if (SinkCommon && Options.SinkCommonInsts && SinkThenElseCodeToEnd(BI)) - return true; - // If the Terminator is the only non-phi instruction, simplify the block. // If LoopHeader is provided, check if the block or its successor is a loop // header. (This is for early invocations before loop simplify and @@ -6008,6 +6180,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (MergeBlockIntoPredecessor(BB)) return true; + if (SinkCommon && Options.SinkCommonInsts) + Changed |= SinkCommonCodeFromPredecessors(BB); + IRBuilder<> Builder(BB); // If there is a trivial two-entry PHI node in this basic block, and we can |