diff options
Diffstat (limited to 'lib/Transforms/Scalar/LoopDistribute.cpp')
-rw-r--r-- | lib/Transforms/Scalar/LoopDistribute.cpp | 398 |
1 files changed, 276 insertions, 122 deletions
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp index 3d3cf3e2890b..7eca28ed2bb7 100644 --- a/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/lib/Transforms/Scalar/LoopDistribute.cpp @@ -22,12 +22,17 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/LoopDistribute.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPassManager.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -60,6 +65,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed for Loop " "Distribution")); +static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( + "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), + cl::Hidden, + cl::desc( + "The maximum number of SCEV checks allowed for Loop " + "Distribution for loop marked with #pragma loop distribute(enable)")); + +// Note that the initial value for this depends on whether the pass is invoked +// directly or from the optimization pipeline. +static cl::opt<bool> EnableLoopDistribute( + "enable-loop-distribute", cl::Hidden, + cl::desc("Enable the new, experimental LoopDistribution Pass")); + STATISTIC(NumLoopsDistributed, "Number of loops distributed"); namespace { @@ -170,7 +188,7 @@ public: // Delete the instructions backwards, as it has a reduced likelihood of // having to update as many def-use and use-def chains. - for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) { + for (auto *Inst : reverse(Unused)) { if (!Inst->use_empty()) Inst->replaceAllUsesWith(UndefValue::get(Inst->getType())); Inst->eraseFromParent(); @@ -571,121 +589,39 @@ private: AccessesType Accesses; }; -/// \brief The pass class. -class LoopDistribute : public FunctionPass { +/// \brief The actual class performing the per-loop work. +class LoopDistributeForLoop { public: - LoopDistribute() : FunctionPass(ID) { - initializeLoopDistributePass(*PassRegistry::getPassRegistry()); - } - - bool runOnFunction(Function &F) override { - LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - LAA = &getAnalysis<LoopAccessAnalysis>(); - DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - - // Build up a worklist of inner-loops to vectorize. This is necessary as the - // act of distributing a loop creates new loops and can invalidate iterators - // across the loops. - SmallVector<Loop *, 8> Worklist; - - for (Loop *TopLevelLoop : *LI) - for (Loop *L : depth_first(TopLevelLoop)) - // We only handle inner-most loops. - if (L->empty()) - Worklist.push_back(L); - - // Now walk the identified inner loops. - bool Changed = false; - for (Loop *L : Worklist) - Changed |= processLoop(L); - - // Process each loop nest in the function. - return Changed; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<ScalarEvolutionWrapperPass>(); - AU.addRequired<LoopInfoWrapperPass>(); - AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<LoopAccessAnalysis>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addPreserved<DominatorTreeWrapperPass>(); - } - - static char ID; - -private: - /// \brief Filter out checks between pointers from the same partition. - /// - /// \p PtrToPartition contains the partition number for pointers. Partition - /// number -1 means that the pointer is used in multiple partitions. In this - /// case we can't safely omit the check. - SmallVector<RuntimePointerChecking::PointerCheck, 4> - includeOnlyCrossPartitionChecks( - const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, - const SmallVectorImpl<int> &PtrToPartition, - const RuntimePointerChecking *RtPtrChecking) { - SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; - - std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), - [&](const RuntimePointerChecking::PointerCheck &Check) { - for (unsigned PtrIdx1 : Check.first->Members) - for (unsigned PtrIdx2 : Check.second->Members) - // Only include this check if there is a pair of pointers - // that require checking and the pointers fall into - // separate partitions. - // - // (Note that we already know at this point that the two - // pointer groups need checking but it doesn't follow - // that each pair of pointers within the two groups need - // checking as well. - // - // In other words we don't want to include a check just - // because there is a pair of pointers between the two - // pointer groups that require checks and a different - // pair whose pointers fall into different partitions.) - if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && - !RuntimePointerChecking::arePointersInSamePartition( - PtrToPartition, PtrIdx1, PtrIdx2)) - return true; - return false; - }); - - return Checks; + LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE) + : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) { + setForced(); } /// \brief Try to distribute an inner-most loop. - bool processLoop(Loop *L) { + bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) { assert(L->empty() && "Only process inner loops."); DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName() << "\" checking " << *L << "\n"); BasicBlock *PH = L->getLoopPreheader(); - if (!PH) { - DEBUG(dbgs() << "Skipping; no preheader"); - return false; - } - if (!L->getExitBlock()) { - DEBUG(dbgs() << "Skipping; multiple exit blocks"); - return false; - } - // LAA will check that we only have a single exiting block. + if (!PH) + return fail("no preheader"); + if (!L->getExitBlock()) + return fail("multiple exit blocks"); - const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap()); + // LAA will check that we only have a single exiting block. + LAI = &GetLAA(*L); // Currently, we only distribute to isolate the part of the loop with // dependence cycles to enable partial vectorization. - if (LAI.canVectorizeMemory()) { - DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization"); - return false; - } - auto *Dependences = LAI.getDepChecker().getDependences(); - if (!Dependences || Dependences->empty()) { - DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate"); - return false; - } + if (LAI->canVectorizeMemory()) + return fail("memory operations are safe for vectorization"); + + auto *Dependences = LAI->getDepChecker().getDependences(); + if (!Dependences || Dependences->empty()) + return fail("no unsafe dependences to isolate"); InstPartitionContainer Partitions(L, LI, DT); @@ -708,7 +644,7 @@ private: // NumUnsafeDependencesActive > 0 indicates this situation and in this case // we just keep assigning to the same cyclic partition until // NumUnsafeDependencesActive reaches 0. - const MemoryDepChecker &DepChecker = LAI.getDepChecker(); + const MemoryDepChecker &DepChecker = LAI->getDepChecker(); MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), *Dependences); @@ -738,14 +674,14 @@ private: DEBUG(dbgs() << "Seeded partitions:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); // Run the merge heuristics: Merge non-cyclic adjacent partitions since we // should be able to vectorize these together. Partitions.mergeBeforePopulating(); DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); // Now, populate the partitions with non-memory operations. Partitions.populateUsedSet(); @@ -757,15 +693,15 @@ private: DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n" << Partitions); if (Partitions.getSize() < 2) - return false; + return fail("cannot isolate unsafe dependencies"); } // Don't distribute the loop if we need too many SCEV run-time checks. - const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate(); - if (Pred.getComplexity() > DistributeSCEVCheckThreshold) { - DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n"); - return false; - } + const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate(); + if (Pred.getComplexity() > (IsForced.getValueOr(false) + ? PragmaDistributeSCEVCheckThreshold + : DistributeSCEVCheckThreshold)) + return fail("too many SCEV run-time checks needed.\n"); DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n"); // We're done forming the partitions set up the reverse mapping from @@ -779,19 +715,20 @@ private: SplitBlock(PH, PH->getTerminator(), DT, LI); // If we need run-time checks, version the loop now. - auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI); - const auto *RtPtrChecking = LAI.getRuntimePointerChecking(); + auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); + const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); const auto &AllChecks = RtPtrChecking->getChecks(); auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, RtPtrChecking); if (!Pred.isAlwaysTrue() || !Checks.empty()) { DEBUG(dbgs() << "\nPointers:\n"); - DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); - LoopVersioning LVer(LAI, L, LI, DT, SE, false); + DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); + LoopVersioning LVer(*LAI, L, LI, DT, SE, false); LVer.setAliasChecks(std::move(Checks)); - LVer.setSCEVChecks(LAI.PSE.getUnionPredicate()); + LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate()); LVer.versionLoop(DefsUsedOutside); + LVer.annotateLoopWithNoAlias(); } // Create identical copies of the original loop for each partition and hook @@ -810,27 +747,244 @@ private: } ++NumLoopsDistributed; + // Report the success. + emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(), + "distributed loop"); return true; } + /// \brief Provide diagnostics then \return with false. + bool fail(llvm::StringRef Message) { + LLVMContext &Ctx = F->getContext(); + bool Forced = isForced().getValueOr(false); + + DEBUG(dbgs() << "Skipping; " << Message << "\n"); + + // With Rpass-missed report that distribution failed. + ORE->emitOptimizationRemarkMissed( + LDIST_NAME, L, + "loop not distributed: use -Rpass-analysis=loop-distribute for more " + "info"); + + // With Rpass-analysis report why. This is on by default if distribution + // was requested explicitly. + emitOptimizationRemarkAnalysis( + Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint + : LDIST_NAME, + *F, L->getStartLoc(), Twine("loop not distributed: ") + Message); + + // Also issue a warning if distribution was requested explicitly but it + // failed. + if (Forced) + Ctx.diagnose(DiagnosticInfoOptimizationFailure( + *F, L->getStartLoc(), "loop not distributed: failed " + "explicitly specified loop distribution")); + + return false; + } + + /// \brief Return if distribution forced to be enabled/disabled for the loop. + /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + const Optional<bool> &isForced() const { return IsForced; } + +private: + /// \brief Filter out checks between pointers from the same partition. + /// + /// \p PtrToPartition contains the partition number for pointers. Partition + /// number -1 means that the pointer is used in multiple partitions. In this + /// case we can't safely omit the check. + SmallVector<RuntimePointerChecking::PointerCheck, 4> + includeOnlyCrossPartitionChecks( + const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks, + const SmallVectorImpl<int> &PtrToPartition, + const RuntimePointerChecking *RtPtrChecking) { + SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; + + std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. + // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); + + return Checks; + } + + /// \brief Check whether the loop metadata is forcing distribution to be + /// enabled/disabled. + void setForced() { + Optional<const MDOperand *> Value = + findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); + if (!Value) + return; + + const MDOperand *Op = *Value; + assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); + IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); + } + + Loop *L; + Function *F; + // Analyses used. LoopInfo *LI; - LoopAccessAnalysis *LAA; + const LoopAccessInfo *LAI; DominatorTree *DT; ScalarEvolution *SE; + OptimizationRemarkEmitter *ORE; + + /// \brief Indicates whether distribution is forced to be enabled/disabled for + /// the loop. + /// + /// If the optional has a value, it indicates whether distribution was forced + /// to be enabled (true) or disabled (false). If the optional has no value + /// distribution was not forced either way. + Optional<bool> IsForced; +}; + +/// Shared implementation between new and old PMs. +static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, + std::function<const LoopAccessInfo &(Loop &)> &GetLAA, + bool ProcessAllLoops) { + // Build up a worklist of inner-loops to vectorize. This is necessary as the + // act of distributing a loop creates new loops and can invalidate iterators + // across the loops. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : *LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE); + + // If distribution was forced for the specific loop to be + // enabled/disabled, follow that. Otherwise use the global flag. + if (LDL.isForced().getValueOr(ProcessAllLoops)) + Changed |= LDL.processLoop(GetLAA); + } + + // Process each loop nest in the function. + return Changed; +} + +/// \brief The pass class. +class LoopDistributeLegacy : public FunctionPass { +public: + /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be + /// performed by default. Pass -enable-loop-distribute={0,1} overrides this + /// default. We use this to keep LoopDistribution off by default when invoked + /// from the optimization pipeline but on when invoked explicitly from opt. + LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true) + : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) { + // The default is set by the caller. + if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; + initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); + std::function<const LoopAccessInfo &(Loop &)> GetLAA = + [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); }; + + return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<ScalarEvolutionWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<LoopAccessLegacyAnalysis>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); + } + + static char ID; + +private: + /// \brief Whether distribution should be on in this function. The per-loop + /// pragma can override this. + bool ProcessAllLoops; }; } // anonymous namespace -char LoopDistribute::ID; +PreservedAnalyses LoopDistributePass::run(Function &F, + FunctionAnalysisManager &AM) { + // FIXME: This does not currently match the behavior from the old PM. + // ProcessAllLoops with the old PM defaults to true when invoked from opt and + // false when invoked from the optimization pipeline. + bool ProcessAllLoops = false; + if (EnableLoopDistribute.getNumOccurrences() > 0) + ProcessAllLoops = EnableLoopDistribute; + + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); + std::function<const LoopAccessInfo &(Loop &)> GetLAA = + [&](Loop &L) -> const LoopAccessInfo & { + return LAM.getResult<LoopAccessAnalysis>(L); + }; + + bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<LoopAnalysis>(); + PA.preserve<DominatorTreeAnalysis>(); + return PA; +} + +char LoopDistributeLegacy::ID; static const char ldist_name[] = "Loop Distribition"; -INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, + false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false) namespace llvm { -FunctionPass *createLoopDistributePass() { return new LoopDistribute(); } +FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) { + return new LoopDistributeLegacy(ProcessAllLoopsByDefault); +} } |