diff options
Diffstat (limited to 'llvm/lib/Transforms/IPO/PassManagerBuilder.cpp')
-rw-r--r-- | llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 277 |
1 files changed, 157 insertions, 120 deletions
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 068328391dff..aa916345954d 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -27,8 +27,10 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Target/CGPassBuilderOption.h" #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" @@ -42,6 +44,7 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/LICM.h" #include "llvm/Transforms/Scalar/LoopUnrollPass.h" +#include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Vectorize.h" @@ -51,6 +54,7 @@ using namespace llvm; +namespace llvm { cl::opt<bool> RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, cl::ZeroOrMore, cl::desc("Run Partial inlinining pass")); @@ -72,7 +76,6 @@ cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, cl::desc("Run the NewGVN pass")); // Experimental option to use CFL-AA -enum class CFLAAType { None, Steensgaard, Andersen, Both }; static cl::opt<::CFLAAType> UseCFLAA("use-cfl-aa", cl::init(::CFLAAType::None), cl::Hidden, cl::desc("Enable the new, experimental CFL alias analysis"), @@ -84,9 +87,9 @@ static cl::opt<::CFLAAType> clEnumValN(::CFLAAType::Both, "both", "Enable both variants of CFL-AA"))); -static cl::opt<bool> EnableLoopInterchange( +cl::opt<bool> EnableLoopInterchange( "enable-loopinterchange", cl::init(false), cl::Hidden, - cl::desc("Enable the new, experimental LoopInterchange Pass")); + cl::desc("Enable the experimental LoopInterchange Pass")); cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, @@ -96,6 +99,10 @@ cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false), cl::Hidden, cl::desc("Enable the LoopFlatten Pass")); +cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", + cl::desc("Enable DFA jump threading."), + cl::init(false), cl::Hidden); + static cl::opt<bool> EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -165,6 +172,10 @@ cl::opt<bool> EnableConstraintElimination( cl::desc( "Enable pass to eliminate conditions based on linear constraints.")); +cl::opt<bool> EnableFunctionSpecialization( + "enable-function-specialization", cl::init(false), cl::Hidden, + cl::desc("Enable Function Specialization pass")); + cl::opt<AttributorRunOption> AttributorRun( "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), cl::desc("Enable the attributor inter-procedural deduction pass."), @@ -178,6 +189,7 @@ cl::opt<AttributorRunOption> AttributorRun( "disable attributor runs"))); extern cl::opt<bool> EnableKnowledgeRetention; +} // namespace llvm PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; @@ -300,7 +312,6 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses( void PassManagerBuilder::populateFunctionPassManager( legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); - FPM.add(createEntryExitInstrumenterPass()); // Add LibraryInfo if we have some. if (LibraryInfo) @@ -317,10 +328,12 @@ void PassManagerBuilder::populateFunctionPassManager( addInitialAliasAnalysisPasses(FPM); + // Lower llvm.expect to metadata before attempting transforms. + // Compare/branch metadata may alter the behavior of passes like SimplifyCFG. + FPM.add(createLowerExpectIntrinsicPass()); FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - FPM.add(createLowerExpectIntrinsicPass()); } // Do PGO instrumentation generation or use pass as the option specified. @@ -432,6 +445,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLoopInstSimplifyPass()); MPM.add(createLoopSimplifyCFGPass()); } + // Try to remove as much code from the loop header as possible, + // to reduce amount of IR that will have to be duplicated. + // TODO: Investigate promotion cap for O1. + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO)); // TODO: Investigate promotion cap for O1. @@ -441,7 +458,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( else MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); // FIXME: We break the loop pass pipeline here in order to do full - // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the + // simplifycfg. Eventually loop-simplifycfg should be enhanced to replace the // need for this. MPM.add(createCFGSimplificationPass()); MPM.add(createInstructionCombiningPass()); @@ -472,7 +489,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies } - MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset MPM.add(createSCCPPass()); // Constant prop with SCCP if (EnableConstraintElimination) @@ -488,11 +504,15 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1) { + if (EnableDFAJumpThreading && SizeLevel == 0) + MPM.add(createDFAJumpThreadingPass()); + MPM.add(createJumpThreadingPass()); // Thread jumps MPM.add(createCorrelatedValuePropagationPass()); } MPM.add(createAggressiveDCEPass()); // Delete dead instructions + MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset // TODO: Investigate if this is too expensive at O1. if (OptLevel > 1) { MPM.add(createDeadStoreEliminationPass()); // Delete dead stores @@ -504,7 +524,9 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (RerollLoops) MPM.add(createLoopRerollPass()); - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + // Merge & remove BBs and sink & hoist common instructions. + MPM.add(createCFGSimplificationPass( + SimplifyCFGOptions().hoistCommonInsts(true).sinkCommonInsts(true))); // Clean up after everything. MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); @@ -514,6 +536,120 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createControlHeightReductionLegacyPass()); } +/// FIXME: Should LTO cause any differences to this set of passes? +void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, + bool IsFullLTO) { + PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); + + if (IsFullLTO) { + // The vectorizer may have significantly shortened a loop body; unroll + // again. Unroll small loops to hide loop backedge latency and saturate any + // parallel execution resources of an out-of-order processor. We also then + // need to clean up redundancies and loop invariant code. + // FIXME: It would be really good to use a loop-integrated instruction + // combiner for cleanup here so that the unrolling and LICM can be pipelined + // across the loop nests. + // We do UnrollAndJam in a separate LPM to ensure it happens before unroll + if (EnableUnrollAndJam && !DisableUnrollLoops) + PM.add(createLoopUnrollAndJamPass(OptLevel)); + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createWarnMissedTransformationsPass()); + } + + if (!IsFullLTO) { + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + PM.add(createLoopLoadEliminationPass()); + } + // Cleanup after the loop optimization passes. + PM.add(createInstructionCombiningPass()); + + if (OptLevel > 1 && ExtraVectorizerPasses) { + // At higher optimization levels, try to clean up any runtime overlap and + // alignment checks inserted by the vectorizer. We want to track correlated + // runtime checks for two inner loops in the same outer loop, fold any + // common computations, hoist loop-invariant aspects out of any outer loop, + // and unswitch the runtime checks if possible. Once hoisted, we may have + // dead (or speculatable) control flows or more combining opportunities. + PM.add(createEarlyCSEPass()); + PM.add(createCorrelatedValuePropagationPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + } + + // Now that we've formed fast to execute loop structures, we do further + // optimizations. These are run afterward as they might block doing complex + // analyses and transforms such as what are needed for loop vectorization. + + // Cleanup after loop vectorization, etc. Simplification passes like CVP and + // GVN, loop transforms, and others have already run, so it's now better to + // convert to more optimized IR using more aggressive simplify CFG options. + // The extra sinking transform can create larger basic blocks, so do this + // before SLP vectorization. + PM.add(createCFGSimplificationPass(SimplifyCFGOptions() + .forwardSwitchCondToPhi(true) + .convertSwitchToLookupTable(true) + .needCanonicalLoops(false) + .hoistCommonInsts(true) + .sinkCommonInsts(true))); + + if (IsFullLTO) { + PM.add(createSCCPPass()); // Propagate exposed constants + PM.add(createInstructionCombiningPass()); // Clean up again + PM.add(createBitTrackingDCEPass()); + } + + // Optimize parallel scalar instruction chains into SIMD instructions. + if (SLPVectorize) { + PM.add(createSLPVectorizerPass()); + if (OptLevel > 1 && ExtraVectorizerPasses) + PM.add(createEarlyCSEPass()); + } + + // Enhance/cleanup vector code. + PM.add(createVectorCombinePass()); + + if (!IsFullLTO) { + addExtensionsToPM(EP_Peephole, PM); + PM.add(createInstructionCombiningPass()); + + if (EnableUnrollAndJam && !DisableUnrollLoops) { + // Unroll and Jam. We do this before unroll but need to be in a separate + // loop pass manager in order for the outer loop to be processed by + // unroll and jam before the inner loop is unrolled. + PM.add(createLoopUnrollAndJamPass(OptLevel)); + } + + // Unroll small loops + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + + if (!DisableUnrollLoops) { + // LoopUnroll may generate some redundency to cleanup. + PM.add(createInstructionCombiningPass()); + + // Runtime unrolling will introduce runtime check in loop prologue. If the + // unrolled loop is a inner loop, then the prologue will be inside the + // outer loop. LICM pass can help to promote the runtime check out if the + // checked value is loop invariant. + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } + + PM.add(createWarnMissedTransformationsPass()); + } + + // After vectorization and unrolling, assume intrinsics may tell us more + // about pointer alignments. + PM.add(createAlignmentFromAssumptionsPass()); + + if (IsFullLTO) + PM.add(createInstructionCombiningPass()); +} + void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link @@ -616,6 +752,10 @@ void PassManagerBuilder::populateModulePassManager( if (OptLevel > 2) MPM.add(createCallSiteSplittingPass()); + // Propage constant function arguments by specializing the functions. + if (OptLevel > 2 && EnableFunctionSpecialization) + MPM.add(createFunctionSpecializationPass()); + MPM.add(createIPSCCPPass()); // IP SCCP MPM.add(createCalledValuePropagationPass()); @@ -663,7 +803,7 @@ void PassManagerBuilder::populateModulePassManager( // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if // there are no OpenMP runtime calls present in the module. if (OptLevel > 1) - MPM.add(createOpenMPOptLegacyPass()); + MPM.add(createOpenMPOptCGSCCLegacyPass()); MPM.add(createPostOrderFunctionAttrsLegacyPass()); if (OptLevel > 2) @@ -785,88 +925,7 @@ void PassManagerBuilder::populateModulePassManager( // llvm.loop.distribute=true or when -enable-loop-distribute is specified. MPM.add(createLoopDistributePass()); - MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - MPM.add(createLoopLoadEliminationPass()); - - // FIXME: Because of #pragma vectorize enable, the passes below are always - // inserted in the pipeline, even when the vectorizer doesn't run (ex. when - // on -O1 and no #pragma is found). Would be good to have these two passes - // as function calls, so that we can only pass them when the vectorizer - // changed the code. - MPM.add(createInstructionCombiningPass()); - if (OptLevel > 1 && ExtraVectorizerPasses) { - // At higher optimization levels, try to clean up any runtime overlap and - // alignment checks inserted by the vectorizer. We want to track correllated - // runtime checks for two inner loops in the same outer loop, fold any - // common computations, hoist loop-invariant aspects out of any outer loop, - // and unswitch the runtime checks if possible. Once hoisted, we may have - // dead (or speculatable) control flows or more combining opportunities. - MPM.add(createEarlyCSEPass()); - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createInstructionCombiningPass()); - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); - MPM.add(createCFGSimplificationPass()); - MPM.add(createInstructionCombiningPass()); - } - - // Cleanup after loop vectorization, etc. Simplification passes like CVP and - // GVN, loop transforms, and others have already run, so it's now better to - // convert to more optimized IR using more aggressive simplify CFG options. - // The extra sinking transform can create larger basic blocks, so do this - // before SLP vectorization. - // FIXME: study whether hoisting and/or sinking of common instructions should - // be delayed until after SLP vectorizer. - MPM.add(createCFGSimplificationPass(SimplifyCFGOptions() - .forwardSwitchCondToPhi(true) - .convertSwitchToLookupTable(true) - .needCanonicalLoops(false) - .hoistCommonInsts(true) - .sinkCommonInsts(true))); - - if (SLPVectorize) { - MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - if (OptLevel > 1 && ExtraVectorizerPasses) { - MPM.add(createEarlyCSEPass()); - } - } - - // Enhance/cleanup vector code. - MPM.add(createVectorCombinePass()); - - addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createInstructionCombiningPass()); - - if (EnableUnrollAndJam && !DisableUnrollLoops) { - // Unroll and Jam. We do this before unroll but need to be in a separate - // loop pass manager in order for the outer loop to be processed by - // unroll and jam before the inner loop is unrolled. - MPM.add(createLoopUnrollAndJamPass(OptLevel)); - } - - // Unroll small loops - MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - - if (!DisableUnrollLoops) { - // LoopUnroll may generate some redundency to cleanup. - MPM.add(createInstructionCombiningPass()); - - // Runtime unrolling will introduce runtime check in loop prologue. If the - // unrolled loop is a inner loop, then the prologue will be inside the - // outer loop. LICM pass can help to promote the runtime check out if the - // checked value is loop invariant. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); - } - - MPM.add(createWarnMissedTransformationsPass()); - - // After vectorization and unrolling, assume intrinsics may tell us more - // about pointer alignments. - MPM.add(createAlignmentFromAssumptionsPass()); + addVectorPasses(MPM, /* IsFullLTO */ false); // FIXME: We shouldn't bother with this anymore. MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes @@ -952,6 +1011,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add( createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); + // Propage constant function arguments by specializing the functions. + if (EnableFunctionSpecialization) + PM.add(createFunctionSpecializationPass()); + // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function // pointers passed as arguments to direct uses of functions. @@ -1023,7 +1086,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if // there are no OpenMP runtime calls present in the module. if (OptLevel > 1) - PM.add(createOpenMPOptLegacyPass()); + PM.add(createOpenMPOptCGSCCLegacyPass()); // Optimize globals again if we ran the inliner. if (RunInliner) @@ -1076,35 +1139,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops, ForgetAllSCEVInLoopUnroll)); PM.add(createLoopDistributePass()); - PM.add(createLoopVectorizePass(true, !LoopVectorize)); - // The vectorizer may have significantly shortened a loop body; unroll again. - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - - PM.add(createWarnMissedTransformationsPass()); - - // Now that we've optimized loops (in particular loop induction variables), - // we may have exposed more scalar opportunities. Run parts of the scalar - // optimizer again at this point. - PM.add(createInstructionCombiningPass()); // Initial cleanup - PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert - .hoistCommonInsts(true))); - PM.add(createSCCPPass()); // Propagate exposed constants - PM.add(createInstructionCombiningPass()); // Clean up again - PM.add(createBitTrackingDCEPass()); - - // More scalar chains could be vectorized due to more alias information - if (SLPVectorize) - PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. - - PM.add(createVectorCombinePass()); // Clean up partial vectorization. - - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. - PM.add(createAlignmentFromAssumptionsPass()); - // Cleanup and simplify the code after the scalar optimizations. - PM.add(createInstructionCombiningPass()); + addVectorPasses(PM, /* IsFullLTO */ true); + addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true)); |