aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Scalar/LoopDistribute.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Scalar/LoopDistribute.cpp')
-rw-r--r--lib/Transforms/Scalar/LoopDistribute.cpp398
1 files changed, 276 insertions, 122 deletions
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 3d3cf3e2890b..7eca28ed2bb7 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -22,12 +22,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPassManager.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -60,6 +65,19 @@ static cl::opt<unsigned> DistributeSCEVCheckThreshold(
cl::desc("The maximum number of SCEV checks allowed for Loop "
"Distribution"));
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+// Note that the initial value for this depends on whether the pass is invoked
+// directly or from the optimization pipeline.
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"));
+
STATISTIC(NumLoopsDistributed, "Number of loops distributed");
namespace {
@@ -170,7 +188,7 @@ public:
// Delete the instructions backwards, as it has a reduced likelihood of
// having to update as many def-use and use-def chains.
- for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
+ for (auto *Inst : reverse(Unused)) {
if (!Inst->use_empty())
Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
Inst->eraseFromParent();
@@ -571,121 +589,39 @@ private:
AccessesType Accesses;
};
-/// \brief The pass class.
-class LoopDistribute : public FunctionPass {
+/// \brief The actual class performing the per-loop work.
+class LoopDistributeForLoop {
public:
- LoopDistribute() : FunctionPass(ID) {
- initializeLoopDistributePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LAA = &getAnalysis<LoopAccessAnalysis>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
- if (L->empty())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist)
- Changed |= processLoop(L);
-
- // Process each loop nest in the function.
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- static char ID;
-
-private:
- /// \brief Filter out checks between pointers from the same partition.
- ///
- /// \p PtrToPartition contains the partition number for pointers. Partition
- /// number -1 means that the pointer is used in multiple partitions. In this
- /// case we can't safely omit the check.
- SmallVector<RuntimePointerChecking::PointerCheck, 4>
- includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
- const SmallVectorImpl<int> &PtrToPartition,
- const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
-
- std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
-
- return Checks;
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
}
/// \brief Try to distribute an inner-most loop.
- bool processLoop(Loop *L) {
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->empty() && "Only process inner loops.");
DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
<< "\" checking " << *L << "\n");
BasicBlock *PH = L->getLoopPreheader();
- if (!PH) {
- DEBUG(dbgs() << "Skipping; no preheader");
- return false;
- }
- if (!L->getExitBlock()) {
- DEBUG(dbgs() << "Skipping; multiple exit blocks");
- return false;
- }
- // LAA will check that we only have a single exiting block.
+ if (!PH)
+ return fail("no preheader");
+ if (!L->getExitBlock())
+ return fail("multiple exit blocks");
- const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
+ // LAA will check that we only have a single exiting block.
+ LAI = &GetLAA(*L);
// Currently, we only distribute to isolate the part of the loop with
// dependence cycles to enable partial vectorization.
- if (LAI.canVectorizeMemory()) {
- DEBUG(dbgs() << "Skipping; memory operations are safe for vectorization");
- return false;
- }
- auto *Dependences = LAI.getDepChecker().getDependences();
- if (!Dependences || Dependences->empty()) {
- DEBUG(dbgs() << "Skipping; No unsafe dependences to isolate");
- return false;
- }
+ if (LAI->canVectorizeMemory())
+ return fail("memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("no unsafe dependences to isolate");
InstPartitionContainer Partitions(L, LI, DT);
@@ -708,7 +644,7 @@ private:
// NumUnsafeDependencesActive > 0 indicates this situation and in this case
// we just keep assigning to the same cyclic partition until
// NumUnsafeDependencesActive reaches 0.
- const MemoryDepChecker &DepChecker = LAI.getDepChecker();
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
*Dependences);
@@ -738,14 +674,14 @@ private:
DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
// Run the merge heuristics: Merge non-cyclic adjacent partitions since we
// should be able to vectorize these together.
Partitions.mergeBeforePopulating();
DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
// Now, populate the partitions with non-memory operations.
Partitions.populateUsedSet();
@@ -757,15 +693,15 @@ private:
DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
<< Partitions);
if (Partitions.getSize() < 2)
- return false;
+ return fail("cannot isolate unsafe dependencies");
}
// Don't distribute the loop if we need too many SCEV run-time checks.
- const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
- if (Pred.getComplexity() > DistributeSCEVCheckThreshold) {
- DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
- return false;
- }
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("too many SCEV run-time checks needed.\n");
DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
// We're done forming the partitions set up the reverse mapping from
@@ -779,19 +715,20 @@ private:
SplitBlock(PH, PH->getTerminator(), DT, LI);
// If we need run-time checks, version the loop now.
- auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
- const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
const auto &AllChecks = RtPtrChecking->getChecks();
auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
RtPtrChecking);
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
DEBUG(dbgs() << "\nPointers:\n");
- DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
- LoopVersioning LVer(LAI, L, LI, DT, SE, false);
+ DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+ LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
LVer.setAliasChecks(std::move(Checks));
- LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
+ LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
}
// Create identical copies of the original loop for each partition and hook
@@ -810,27 +747,244 @@ private:
}
++NumLoopsDistributed;
+ // Report the success.
+ emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
+ "distributed loop");
return true;
}
+ /// \brief Provide diagnostics then \return with false.
+ bool fail(llvm::StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With Rpass-missed report that distribution failed.
+ ORE->emitOptimizationRemarkMissed(
+ LDIST_NAME, L,
+ "loop not distributed: use -Rpass-analysis=loop-distribute for more "
+ "info");
+
+ // With Rpass-analysis report why. This is on by default if distribution
+ // was requested explicitly.
+ emitOptimizationRemarkAnalysis(
+ Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
+ : LDIST_NAME,
+ *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// \brief Return if distribution forced to be enabled/disabled for the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// \brief Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerChecking::PointerCheck, 4>
+ includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+
+ std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
+ [&](const RuntimePointerChecking::PointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// \brief Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
// Analyses used.
LoopInfo *LI;
- LoopAccessAnalysis *LAA;
+ const LoopAccessInfo *LAI;
DominatorTree *DT;
ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// \brief Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA,
+ bool ProcessAllLoops) {
+ // Build up a worklist of inner-loops to vectorize. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
+ if (L->empty())
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(ProcessAllLoops))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
+
+/// \brief The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
+ /// performed by default. Pass -enable-loop-distribute={0,1} overrides this
+ /// default. We use this to keep LoopDistribution off by default when invoked
+ /// from the optimization pipeline but on when invoked explicitly from opt.
+ LoopDistributeLegacy(bool ProcessAllLoopsByDefault = true)
+ : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
+ // The default is set by the caller.
+ if (EnableLoopDistribute.getNumOccurrences() > 0)
+ ProcessAllLoops = EnableLoopDistribute;
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA, ProcessAllLoops);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ static char ID;
+
+private:
+ /// \brief Whether distribution should be on in this function. The per-loop
+ /// pragma can override this.
+ bool ProcessAllLoops;
};
} // anonymous namespace
-char LoopDistribute::ID;
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ // FIXME: This does not currently match the behavior from the old PM.
+ // ProcessAllLoops with the old PM defaults to true when invoked from opt and
+ // false when invoked from the optimization pipeline.
+ bool ProcessAllLoops = false;
+ if (EnableLoopDistribute.getNumOccurrences() > 0)
+ ProcessAllLoops = EnableLoopDistribute;
+
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
+ return LAM.getResult<LoopAccessAnalysis>(L);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA, ProcessAllLoops);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
static const char ldist_name[] = "Loop Distribition";
-INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
namespace llvm {
-FunctionPass *createLoopDistributePass() { return new LoopDistribute(); }
+FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
+ return new LoopDistributeLegacy(ProcessAllLoopsByDefault);
+}
}