Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
200 files changed, 10067 insertions, 4276 deletions
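Context for the new CFGuard pass below: it instruments nothing unless the module carries the cfguard=2 module flag (see CFGuard::doInitialization in the first new file). A minimal sketch of how a front end might opt in and schedule the pass; the helper name enableCFGuard and the Warning merge behavior are assumptions, while the factory functions come from the new file itself:

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/CFGuard.h"

  // Hypothetical helper: tag the module so the pass runs, then schedule it.
  static void enableCFGuard(llvm::Module &M, llvm::legacy::PassManager &PM) {
    // CFGuard::doInitialization skips modules unless this flag equals 2.
    M.addModuleFlag(llvm::Module::Warning, "cfguard", 2);
    // Guard-check mechanism; X86_64 would use createCFGuardDispatchPass().
    PM.add(llvm::createCFGuardCheckPass());
  }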
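Similarly, the coroutine passes in this import are renamed with a Legacy suffix (CoroEarly -> CoroEarlyLegacy, and so on), so legacy pass-manager clients must call the renamed factories. A minimal sketch assuming a hypothetical wrapper name; the factory functions and their overall ordering mirror those registered across the extension points in Coroutines.cpp below:

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/Transforms/Coroutines.h"

  // Schedule the full coroutine lowering pipeline on a legacy PassManager.
  static void addCoroutinePasses(llvm::legacy::PassManager &PM) {
    PM.add(llvm::createCoroEarlyLegacyPass());   // lower early coro intrinsics
    PM.add(llvm::createCoroSplitLegacyPass());   // split into resume/destroy parts
    PM.add(llvm::createCoroElideLegacyPass());   // elide frame allocations
    PM.add(llvm::createCoroCleanupLegacyPass()); // lower remaining intrinsics
  }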
diff --git a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index a24de3ca213f..59b94567a9c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp new file mode 100644 index 000000000000..7c5e90cb53cd --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -0,0 +1,305 @@ +//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the IR transform to add Microsoft's Control Flow Guard +/// checks on Windows targets. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/CFGuard.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; + +using OperandBundleDef = OperandBundleDefT<Value *>; + +#define DEBUG_TYPE "cfguard" + +STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added"); + +namespace { + +/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes. +/// These checks ensure that the target address corresponds to the start of an +/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86, +/// ARM, and AArch64 targets use the CF_Check mechanism. class CFGuard : public FunctionPass { +public: + static char ID; + + enum Mechanism { CF_Check, CF_Dispatch }; + + // Default constructor required for the INITIALIZE_PASS macro. + CFGuard() : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + // By default, use the guard check mechanism. + GuardMechanism = CF_Check; + } + + // Recommended constructor used to specify the type of guard mechanism. + CFGuard(Mechanism Var) : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + GuardMechanism = Var; + } + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// check mechanism. When the image is loaded, the loader puts the appropriate + /// guard check function pointer in the __guard_check_icall_fptr global + /// symbol. This checks that the target address is a valid address-taken + /// function. The address of the target function is passed to the guard check + /// function in an architecture-specific register (e.g. ECX on 32-bit X86, + /// X15 on AArch64, and R0 on ARM). The guard check function has no return + /// value (if the target is invalid, the guard check function will raise an + /// error).
+ /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr + /// %2 = bitcast i32 ()* %0 to i8* + /// call cfguard_checkcc void %1(i8* %2) + /// %3 = call i32 %0() + /// \endcode + /// + /// For example, the following X86 assembly code: + /// \code + /// movl $_target_func, %eax + /// calll *%eax + /// \endcode + /// + /// is transformed to: + /// \code + /// movl $_target_func, %ecx + /// calll *___guard_check_icall_fptr + /// calll *%ecx + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardCheck(CallBase *CB); + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// dispatch mechanism. When the image is loaded, the loader puts the + /// appropriate guard check function pointer in the + /// __guard_dispatch_icall_fptr global symbol. This checks that the target + /// address is a valid address-taken function and, if so, tail calls the + /// target. The target address is passed in an architecture-specific register + /// (e.g. RAX on X86_64), with all other arguments for the target function + /// passed as usual. + /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr + /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ] + /// \endcode + /// + /// For example, the following X86_64 assembly code: + /// \code + /// leaq target_func(%rip), %rax + /// callq *%rax + /// \endcode + /// + /// is transformed to: + /// \code + /// leaq target_func(%rip), %rax + /// callq *__guard_dispatch_icall_fptr(%rip) + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardDispatch(CallBase *CB); + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + +private: + // Only add checks if the module has the cfguard=2 flag. + int cfguard_module_flag = 0; + Mechanism GuardMechanism = CF_Check; + FunctionType *GuardFnType = nullptr; + PointerType *GuardFnPtrType = nullptr; + Constant *GuardFnGlobal = nullptr; +}; + +} // end anonymous namespace + +void CFGuard::insertCFGuardCheck(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + + // Load the global symbol as a pointer to the check function. + LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); + + // Create new call instruction. The CFGuard check should always be a call, + // even if the original CallBase is an Invoke or CallBr instruction. 
+ CallInst *GuardCheck = + B.CreateCall(GuardFnType, GuardCheckLoad, + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + + // Ensure that the first argument is passed in the correct register + // (e.g. ECX on 32-bit X86 targets). + GuardCheck->setCallingConv(CallingConv::CFGuard_Check); +} + +void CFGuard::insertCFGuardDispatch(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + Type *CalledOperandType = CalledOperand->getType(); + + // Cast the guard dispatch global to the type of the called operand. + PointerType *PTy = PointerType::get(CalledOperandType, 0); + if (GuardFnGlobal->getType() != PTy) + GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy); + + // Load the global as a pointer to a function of the same type. + LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal); + + // Add the original call target as a cfguardtarget operand bundle. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + CB->getOperandBundlesAsDefs(Bundles); + Bundles.emplace_back("cfguardtarget", CalledOperand); + + // Create a copy of the call/invoke instruction and add the new bundle. + CallBase *NewCB; + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + NewCB = CallInst::Create(CI, Bundles, CB); + } else { + assert(isa<InvokeInst>(CB) && "Unknown indirect call type"); + InvokeInst *II = cast<InvokeInst>(CB); + NewCB = llvm::InvokeInst::Create(II, Bundles, CB); + } + + // Change the target of the call to be the guard dispatch function. + NewCB->setCalledOperand(GuardDispatchLoad); + + // Replace the original call/invoke with the new instruction. + CB->replaceAllUsesWith(NewCB); + + // Delete the original call/invoke. + CB->eraseFromParent(); +} + +bool CFGuard::doInitialization(Module &M) { + + // Check if this module has the cfguard flag and read its value. + if (auto *MD = + mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) + cfguard_module_flag = MD->getZExtValue(); + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + // Set up prototypes for the guard check and dispatch functions. + GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()), + {Type::getInt8PtrTy(M.getContext())}, false); + GuardFnPtrType = PointerType::get(GuardFnType, 0); + + // Get or insert the guard check or dispatch global symbols. + if (GuardMechanism == CF_Check) { + GuardFnGlobal = + M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType); + } else { + assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism"); + GuardFnGlobal = + M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType); + } + + return true; +} + +bool CFGuard::runOnFunction(Function &F) { + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + SmallVector<CallBase *, 8> IndirectCalls; + + // Iterate over the instructions to find all indirect call/invoke/callbr + // instructions. Make a separate list of pointers to indirect + // call/invoke/callbr instructions because the original instructions will be + // deleted as the checks are added. 
+ for (BasicBlock &BB : F.getBasicBlockList()) { + for (Instruction &I : BB.getInstList()) { + auto *CB = dyn_cast<CallBase>(&I); + if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) { + IndirectCalls.push_back(CB); + CFGuardCounter++; + } + } + } + + // If no checks are needed, return early. + if (IndirectCalls.empty()) { + return false; + } + + // For each indirect call/invoke, add the appropriate dispatch or check. + if (GuardMechanism == CF_Dispatch) { + for (CallBase *CB : IndirectCalls) { + insertCFGuardDispatch(CB); + } + } else { + for (CallBase *CB : IndirectCalls) { + insertCFGuardCheck(CB); + } + } + + return true; +} + +char CFGuard::ID = 0; +INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false) + +FunctionPass *llvm::createCFGuardCheckPass() { + return new CFGuard(CFGuard::CF_Check); +} + +FunctionPass *llvm::createCFGuardDispatchPass() { + return new CFGuard(CFGuard::CF_Dispatch); +}
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index c3e05577f044..c2dbd6f41642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -99,11 +99,11 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { namespace { -struct CoroCleanup : FunctionPass { +struct CoroCleanupLegacy : FunctionPass { static char ID; // Pass identification, replacement for typeid - CoroCleanup() : FunctionPass(ID) { - initializeCoroCleanupPass(*PassRegistry::getPassRegistry()); + CoroCleanupLegacy() : FunctionPass(ID) { + initializeCoroCleanupLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -132,8 +132,8 @@ struct CoroCleanup : FunctionPass { }; } -char CoroCleanup::ID = 0; -INITIALIZE_PASS(CoroCleanup, "coro-cleanup", +char CoroCleanupLegacy::ID = 0; +INITIALIZE_PASS(CoroCleanupLegacy, "coro-cleanup", "Lower all coroutine related intrinsics", false, false) -Pass *llvm::createCoroCleanupPass() { return new CoroCleanup(); } +Pass *llvm::createCoroCleanupLegacyPass() { return new CoroCleanupLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 55993d33ee4e..e73fb9eeb1e9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-early" namespace { -// Created on demand if CoroEarly pass has work to do. +// Created on demand if the coro-early pass has work to do. class Lowerer : public coro::LowererBase { IRBuilder<> Builder; PointerType *const AnyResumeFnPtrTy; @@ -225,10 +225,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { namespace { -struct CoroEarly : public FunctionPass { +struct CoroEarlyLegacy : public FunctionPass { static char ID; // Pass identification, replacement for typeid. - CoroEarly() : FunctionPass(ID) { - initializeCoroEarlyPass(*PassRegistry::getPassRegistry()); + CoroEarlyLegacy() : FunctionPass(ID) { + initializeCoroEarlyLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -267,8 +267,8 @@ struct CoroEarly : public FunctionPass { }; } -char CoroEarly::ID = 0; -INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics", - false, false) +char CoroEarlyLegacy::ID = 0; +INITIALIZE_PASS(CoroEarlyLegacy, "coro-early", + "Lower early coroutine intrinsics", false, false) -Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); } +Pass *llvm::createCoroEarlyLegacyPass() { return new CoroEarlyLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp index aca77119023b..23d22e23861a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -23,7 +24,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-elide" namespace { -// Created on demand if CoroElide pass has work to do. 
+// Created on demand if the coro-elide pass has work to do. struct Lowerer : coro::LowererBase { SmallVector<CoroIdInst *, 4> CoroIds; SmallVector<CoroBeginInst *, 1> CoroBegins; @@ -276,10 +277,10 @@ static bool replaceDevirtTrigger(Function &F) { //===----------------------------------------------------------------------===// namespace { -struct CoroElide : FunctionPass { +struct CoroElideLegacy : FunctionPass { static char ID; - CoroElide() : FunctionPass(ID) { - initializeCoroElidePass(*PassRegistry::getPassRegistry()); + CoroElideLegacy() : FunctionPass(ID) { + initializeCoroElideLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -329,15 +330,15 @@ struct CoroElide : FunctionPass { }; } -char CoroElide::ID = 0; +char CoroElideLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) -Pass *llvm::createCoroElidePass() { return new CoroElide(); } +Pass *llvm::createCoroElideLegacyPass() { return new CoroElideLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h index c151474316f9..7eb35400c0d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -21,10 +21,10 @@ class CallGraph; class CallGraphSCC; class PassRegistry; -void initializeCoroEarlyPass(PassRegistry &); -void initializeCoroSplitPass(PassRegistry &); -void initializeCoroElidePass(PassRegistry &); -void initializeCoroCleanupPass(PassRegistry &); +void initializeCoroEarlyLegacyPass(PassRegistry &); +void initializeCoroSplitLegacyPass(PassRegistry &); +void initializeCoroElideLegacyPass(PassRegistry &); +void initializeCoroCleanupLegacyPass(PassRegistry &); // CoroEarly pass marks every function that has coro.begin with a string // attribute "coroutine.presplit"="0". 
CoroSplit pass processes the coroutine @@ -43,7 +43,8 @@ void initializeCoroCleanupPass(PassRegistry &); namespace coro { -bool declaresIntrinsics(Module &M, std::initializer_list<StringRef>); +bool declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef>); void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement); void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement); void replaceCoroFree(CoroIdInst *CoroId, bool Elide); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 04723cbde417..66cb3e74e53e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -27,7 +27,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -52,6 +51,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -60,6 +60,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <cassert> #include <cstddef> @@ -157,8 +158,9 @@ private: } // end anonymous namespace -static void maybeFreeRetconStorage(IRBuilder<> &Builder, coro::Shape &Shape, - Value *FramePtr, CallGraph *CG) { +static void maybeFreeRetconStorage(IRBuilder<> &Builder, + const coro::Shape &Shape, Value *FramePtr, + CallGraph *CG) { assert(Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce); if (Shape.RetconLowering.IsFrameInlineInStorage) @@ -168,9 +170,9 @@ static void maybeFreeRetconStorage(IRBuilder<> &Builder, coro::Shape &Shape, } /// Replace a non-unwind call to llvm.coro.end. -static void replaceFallthroughCoroEnd(CoroEndInst *End, coro::Shape &Shape, - Value *FramePtr, bool InResume, - CallGraph *CG) { +static void replaceFallthroughCoroEnd(CoroEndInst *End, + const coro::Shape &Shape, Value *FramePtr, + bool InResume, CallGraph *CG) { // Start inserting right before the coro.end. IRBuilder<> Builder(End); @@ -218,7 +220,7 @@ static void replaceFallthroughCoroEnd(CoroEndInst *End, coro::Shape &Shape, } /// Replace an unwind call to llvm.coro.end. -static void replaceUnwindCoroEnd(CoroEndInst *End, coro::Shape &Shape, +static void replaceUnwindCoroEnd(CoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, bool InResume, CallGraph *CG){ IRBuilder<> Builder(End); @@ -245,7 +247,7 @@ static void replaceUnwindCoroEnd(CoroEndInst *End, coro::Shape &Shape, } } -static void replaceCoroEnd(CoroEndInst *End, coro::Shape &Shape, +static void replaceCoroEnd(CoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, bool InResume, CallGraph *CG) { if (End->isUnwind()) replaceUnwindCoroEnd(End, Shape, FramePtr, InResume, CG); @@ -781,7 +783,7 @@ static Function *createClone(Function &F, const Twine &Suffix, } /// Remove calls to llvm.coro.end in the original function. 
-static void removeCoroEnds(coro::Shape &Shape, CallGraph *CG) { +static void removeCoroEnds(const coro::Shape &Shape, CallGraph *CG) { for (auto End : Shape.CoroEnds) { replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, CG); } @@ -906,17 +908,29 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, // values and select the correct case successor when possible. static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { DenseMap<Value *, Value *> ResolvedValues; + BasicBlock *UnconditionalSucc = nullptr; Instruction *I = InitialInst; while (I->isTerminator()) { if (isa<ReturnInst>(I)) { - if (I != InitialInst) + if (I != InitialInst) { + // If InitialInst is an unconditional branch, + // remove PHI values that come from basic block of InitialInst + if (UnconditionalSucc) + for (PHINode &PN : UnconditionalSucc->phis()) { + int idx = PN.getBasicBlockIndex(InitialInst->getParent()); + if (idx != -1) + PN.removeIncomingValue(idx); + } ReplaceInstWithInst(InitialInst, I->clone()); + } return true; } if (auto *BR = dyn_cast<BranchInst>(I)) { if (BR->isUnconditional()) { BasicBlock *BB = BR->getSuccessor(0); + if (I == InitialInst) + UnconditionalSucc = BB; scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); I = BB->getFirstNonPHIOrDbgOrLifetime(); continue; @@ -1407,9 +1421,10 @@ static void prepareForSplit(Function &F, CallGraph &CG) { CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode()); } -// Make sure that there is a devirtualization trigger function that CoroSplit -// pass uses the force restart CGSCC pipeline. If devirt trigger function is not -// found, we will create one and add it to the current SCC. +// Make sure that there is a devirtualization trigger function that the +// coro-split pass uses to force a restart of the CGSCC pipeline. If the devirt +// trigger function is not found, we will create one and add it to the current +// SCC. 
static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) { Module &M = CG.getModule(); if (M.getFunction(CORO_DEVIRT_TRIGGER_FN)) @@ -1512,11 +1527,11 @@ static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) { namespace { -struct CoroSplit : public CallGraphSCCPass { +struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplit() : CallGraphSCCPass(ID) { - initializeCoroSplitPass(*PassRegistry::getPassRegistry()); + CoroSplitLegacy() : CallGraphSCCPass(ID) { + initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); } bool Run = false; @@ -1586,16 +1601,16 @@ struct CoroSplit : public CallGraphSCCPass { } // end anonymous namespace -char CoroSplit::ID = 0; +char CoroSplitLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_END( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) -Pass *llvm::createCoroSplitPass() { return new CoroSplit(); } +Pass *llvm::createCoroSplitLegacyPass() { return new CoroSplitLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp index f39483b27518..02d11af3303f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -11,14 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Coroutines.h" -#include "llvm-c/Transforms/Coroutines.h" #include "CoroInstr.h" #include "CoroInternal.h" +#include "llvm-c/Transforms/Coroutines.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -31,10 +30,12 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstddef> #include <utility> @@ -42,39 +43,39 @@ using namespace llvm; void llvm::initializeCoroutines(PassRegistry &Registry) { - initializeCoroEarlyPass(Registry); - initializeCoroSplitPass(Registry); - initializeCoroElidePass(Registry); - initializeCoroCleanupPass(Registry); + initializeCoroEarlyLegacyPass(Registry); + initializeCoroSplitLegacyPass(Registry); + initializeCoroElideLegacyPass(Registry); + initializeCoroCleanupLegacyPass(Registry); } static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroSplitPass()); - PM.add(createCoroElidePass()); + PM.add(createCoroSplitLegacyPass()); + PM.add(createCoroElideLegacyPass()); PM.add(createBarrierNoopPass()); - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroEarlyPass()); + 
PM.add(createCoroEarlyLegacyPass()); } static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroElidePass()); + PM.add(createCoroElideLegacyPass()); } static void addCoroutineSCCPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroSplitPass()); + PM.add(createCoroSplitLegacyPass()); } static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) { @@ -150,8 +151,8 @@ static bool isCoroutineIntrinsicName(StringRef Name) { // Verifies if a module has named values listed. Also, in debug mode verifies // that names are intrinsic names. -bool coro::declaresIntrinsics(Module &M, - std::initializer_list<StringRef> List) { +bool coro::declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef> List) { for (StringRef Name : List) { assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic"); if (M.getNamedValue(Name)) @@ -634,17 +635,17 @@ void AnyCoroIdRetconInst::checkWellFormed() const { } void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroEarlyPass()); + unwrap(PM)->add(createCoroEarlyLegacyPass()); } void LLVMAddCoroSplitPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroSplitPass()); + unwrap(PM)->add(createCoroSplitLegacyPass()); } void LLVMAddCoroElidePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroElidePass()); + unwrap(PM)->add(createCoroElideLegacyPass()); } void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroCleanupPass()); + unwrap(PM)->add(createCoroCleanupLegacyPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index c50805692b98..06d1763353f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index dd9f74a881ee..cdf8a2eb598e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -70,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -386,8 +387,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Just add all the struct element types. 
Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, - I->getParamAlignment(), "", InsertPt); + Value *TheAlloca = + new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, + MaybeAlign(I->getParamAlignment()), "", InsertPt); StructType *STy = cast<StructType>(AgTy); Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp index 95f47345d8fd..f2995817eaf8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp @@ -23,14 +23,18 @@ #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -53,6 +57,8 @@ STATISTIC(NumAttributesValidFixpoint, "Number of abstract attributes in a valid fixpoint state"); STATISTIC(NumAttributesManifested, "Number of abstract attributes manifested in IR"); +STATISTIC(NumAttributesFixedDueToRequiredDependences, + "Number of abstract attributes fixed due to required dependences"); // Some helper macros to deal with statistics tracking. // @@ -98,6 +104,36 @@ STATISTIC(NumAttributesManifested, STATS_DECLTRACK(NAME, Floating, \ ("Number of floating values known to be '" #NAME "'")) +// Specialization of the operator<< for abstract attributes subclasses. This +// disambiguates situations where multiple operators are applicable. +namespace llvm { +#define PIPE_OPERATOR(CLASS) \ + raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \ + return OS << static_cast<const AbstractAttribute &>(AA); \ + } + +PIPE_OPERATOR(AAIsDead) +PIPE_OPERATOR(AANoUnwind) +PIPE_OPERATOR(AANoSync) +PIPE_OPERATOR(AANoRecurse) +PIPE_OPERATOR(AAWillReturn) +PIPE_OPERATOR(AANoReturn) +PIPE_OPERATOR(AAReturnedValues) +PIPE_OPERATOR(AANonNull) +PIPE_OPERATOR(AANoAlias) +PIPE_OPERATOR(AADereferenceable) +PIPE_OPERATOR(AAAlign) +PIPE_OPERATOR(AANoCapture) +PIPE_OPERATOR(AAValueSimplify) +PIPE_OPERATOR(AANoFree) +PIPE_OPERATOR(AAHeapToStack) +PIPE_OPERATOR(AAReachability) +PIPE_OPERATOR(AAMemoryBehavior) +PIPE_OPERATOR(AAValueConstantRange) + +#undef PIPE_OPERATOR +} // namespace llvm + // TODO: Determine a good default value. 
// // In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads @@ -120,6 +156,10 @@ static cl::opt<bool> DisableAttributor( cl::desc("Disable the attributor inter-procedural deduction pass."), cl::init(true)); +static cl::opt<bool> AnnotateDeclarationCallSites( + "attributor-annotate-decl-cs", cl::Hidden, + cl::desc("Annotate call sites of function declarations."), cl::init(false)); + static cl::opt<bool> ManifestInternal( "attributor-manifest-internal", cl::Hidden, cl::desc("Manifest Attributor internal string attributes."), @@ -147,6 +187,74 @@ ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) { } ///} +Argument *IRPosition::getAssociatedArgument() const { + if (getPositionKind() == IRP_ARGUMENT) + return cast<Argument>(&getAnchorValue()); + + // Not an Argument and no argument number means this is not a call site + // argument, thus we cannot find a callback argument to return. + int ArgNo = getArgNo(); + if (ArgNo < 0) + return nullptr; + + // Use abstract call sites to make the connection between the call site + // values and the ones in callbacks. If a callback was found that makes use + // of the underlying call site operand, we want the corresponding callback + // callee argument and not the direct callee argument. + Optional<Argument *> CBCandidateArg; + SmallVector<const Use *, 4> CBUses; + ImmutableCallSite ICS(&getAnchorValue()); + AbstractCallSite::getCallbackUses(ICS, CBUses); + for (const Use *U : CBUses) { + AbstractCallSite ACS(U); + assert(ACS && ACS.isCallbackCall()); + if (!ACS.getCalledFunction()) + continue; + + for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) { + + // Test if the underlying call site operand is argument number u of the + // callback callee. + if (ACS.getCallArgOperandNo(u) != ArgNo) + continue; + + assert(ACS.getCalledFunction()->arg_size() > u && + "ACS mapped into var-args arguments!"); + if (CBCandidateArg.hasValue()) { + CBCandidateArg = nullptr; + break; + } + CBCandidateArg = ACS.getCalledFunction()->getArg(u); + } + } + + // If we found a unique callback candidate argument, return it. + if (CBCandidateArg.hasValue() && CBCandidateArg.getValue()) + return CBCandidateArg.getValue(); + + // If no callbacks were found, or none used the underlying call site operand + // exclusively, use the direct callee argument if available. + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + return Callee->getArg(ArgNo); + + return nullptr; +} + +/// For calls (and invokes) we will only replace instruction uses to not disturb +/// the old style call graph. +/// TODO: Remove this once we get rid of the old PM. +static void replaceAllInstructionUsesWith(Value &Old, Value &New) { + if (!isa<CallBase>(Old)) + return Old.replaceAllUsesWith(&New); + SmallVector<Use *, 8> Uses; + for (Use &U : Old.uses()) + if (isa<Instruction>(U.getUser())) + Uses.push_back(&U); + for (Use *U : Uses) + U->set(&New); +} + /// Recursively visit all values that might become \p IRP at some point. This /// will be done by looking through cast instructions, selects, phis, and calls /// with the "returned" attribute. Once we cannot look through the value any @@ -234,7 +342,7 @@ static bool genericValueTraversal( // If we actually used liveness information so we have to record a dependence. if (AnyDead) - A.recordDependence(*LivenessAA, QueryingAA); + A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); // All values have been visited. 
return true; @@ -282,34 +390,18 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, llvm_unreachable("Expected enum or string attribute!"); } -static const Value *getPointerOperand(const Instruction *I) { - if (auto *LI = dyn_cast<LoadInst>(I)) - if (!LI->isVolatile()) - return LI->getPointerOperand(); - if (auto *SI = dyn_cast<StoreInst>(I)) - if (!SI->isVolatile()) - return SI->getPointerOperand(); - - if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) - if (!CXI->isVolatile()) - return CXI->getPointerOperand(); - - if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) - if (!RMWI->isVolatile()) - return RMWI->getPointerOperand(); - - return nullptr; -} -static const Value *getBasePointerOfAccessPointerOperand(const Instruction *I, - int64_t &BytesOffset, - const DataLayout &DL) { - const Value *Ptr = getPointerOperand(I); +static const Value * +getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, + const DataLayout &DL, + bool AllowNonInbounds = false) { + const Value *Ptr = + Attributor::getPointerOperand(I, /* AllowVolatile */ false); if (!Ptr) return nullptr; return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, - /*AllowNonInbounds*/ false); + AllowNonInbounds); } ChangeStatus AbstractAttribute::update(Attributor &A) { @@ -328,7 +420,7 @@ ChangeStatus AbstractAttribute::update(Attributor &A) { } ChangeStatus -IRAttributeManifest::manifestAttrs(Attributor &A, IRPosition &IRP, +IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP, const ArrayRef<Attribute> &DeducedAttrs) { Function *ScopeFn = IRP.getAssociatedFunction(); IRPosition::Kind PK = IRP.getPositionKind(); @@ -457,13 +549,20 @@ bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs, } void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs, - SmallVectorImpl<Attribute> &Attrs) const { - for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) + SmallVectorImpl<Attribute> &Attrs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { for (Attribute::AttrKind AK : AKs) { const Attribute &Attr = EquivIRP.getAttr(AK); if (Attr.getKindAsEnum() == AK) Attrs.push_back(Attr); } + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. + if (IgnoreSubsumingPositions) + break; + } } void IRPosition::verify() { @@ -517,38 +616,24 @@ void IRPosition::verify() { } namespace { -/// Helper functions to clamp a state \p S of type \p StateType with the +/// Helper function to clamp a state \p S of type \p StateType with the /// information in \p R and indicate/return if \p S did change (as-in update is /// required to be run again). -/// -///{ template <typename StateType> -ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R); - -template <> -ChangeStatus clampStateAndIndicateChange<IntegerState>(IntegerState &S, - const IntegerState &R) { +ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) { auto Assumed = S.getAssumed(); S ^= R; return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } -template <> -ChangeStatus clampStateAndIndicateChange<BooleanState>(BooleanState &S, - const BooleanState &R) { - return clampStateAndIndicateChange<IntegerState>(S, R); -} -///} - /// Clamp the information known for all returned values of a function /// (identified by \p QueryingAA) into \p S. 
template <typename AAType, typename StateType = typename AAType::StateType> static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA, StateType &S) { LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for " - << static_cast<const AbstractAttribute &>(QueryingAA) - << " into " << S << "\n"); + << QueryingAA << " into " << S << "\n"); assert((QueryingAA.getIRPosition().getPositionKind() == IRPosition::IRP_RETURNED || @@ -593,7 +678,8 @@ struct AAComposeTwoGenericDeduction /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { - ChangeStatus ChangedF = F<AAType, G<AAType, Base, StateType>, StateType>::updateImpl(A); + ChangeStatus ChangedF = + F<AAType, G<AAType, Base, StateType>, StateType>::updateImpl(A); ChangeStatus ChangedG = G<AAType, Base, StateType>::updateImpl(A); return ChangedF | ChangedG; } @@ -621,8 +707,7 @@ template <typename AAType, typename StateType = typename AAType::StateType> static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA, StateType &S) { LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for " - << static_cast<const AbstractAttribute &>(QueryingAA) - << " into " << S << "\n"); + << QueryingAA << " into " << S << "\n"); assert(QueryingAA.getIRPosition().getPositionKind() == IRPosition::IRP_ARGUMENT && @@ -718,7 +803,7 @@ struct AAFromMustBeExecutedContext : public Base { void initialize(Attributor &A) override { Base::initialize(A); - IRPosition &IRP = this->getIRPosition(); + const IRPosition &IRP = this->getIRPosition(); Instruction *CtxI = IRP.getCtxI(); if (!CtxI) @@ -739,21 +824,16 @@ struct AAFromMustBeExecutedContext : public Base { MustBeExecutedContextExplorer &Explorer = A.getInfoCache().getMustBeExecutedContextExplorer(); - SetVector<const Use *> NextUses; - - for (const Use *U : Uses) { + auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI); + for (unsigned u = 0; u < Uses.size(); ++u) { + const Use *U = Uses[u]; if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) { - auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI); - bool Found = EIt.count(UserI); - while (!Found && ++EIt != EEnd) - Found = EIt.getCurrentInst() == UserI; + bool Found = Explorer.findInContextOf(UserI, EIt, EEnd); if (Found && Base::followUse(A, U, UserI)) for (const Use &Us : UserI->uses()) - NextUses.insert(&Us); + Uses.insert(&Us); } } - for (const Use *U : NextUses) - Uses.insert(U); return BeforeState == S ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } @@ -994,13 +1074,15 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { auto ReplaceCallSiteUsersWith = [](CallBase &CB, Constant &C) { if (CB.getNumUses() == 0 || CB.isMustTailCall()) return ChangeStatus::UNCHANGED; - CB.replaceAllUsesWith(&C); + replaceAllInstructionUsesWith(CB, C); return ChangeStatus::CHANGED; }; // If the assumed unique return value is an argument, annotate it. if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) { - getIRPosition() = IRPosition::argument(*UniqueRVArg); + // TODO: This should be handled differently! + this->AnchorVal = UniqueRVArg; + this->KindOrArgNo = UniqueRVArg->getArgNo(); Changed = IRAttribute::manifest(A); } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) { // We can replace the returned value with the unique returned constant. 
@@ -1010,14 +1092,18 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { if (CallBase *CB = dyn_cast<CallBase>(U.getUser())) if (CB->isCallee(&U)) { Constant *RVCCast = - ConstantExpr::getTruncOrBitCast(RVC, CB->getType()); + CB->getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, CB->getType()); Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed; } } else { assert(isa<CallBase>(AnchorValue) && "Expcected a function or call base anchor!"); Constant *RVCCast = - ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType()); + AnchorValue.getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType()); Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast); } if (Changed == ChangeStatus::CHANGED) @@ -1157,8 +1243,7 @@ ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { const auto &RetValAA = A.getAAFor<AAReturnedValues>( *this, IRPosition::function(*CB->getCalledFunction())); LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: " - << static_cast<const AbstractAttribute &>(RetValAA) - << "\n"); + << RetValAA << "\n"); // Skip dead ends, thus if we do not know anything about the returned // call we mark it as unresolved and it will stay that way. @@ -1393,7 +1478,7 @@ ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { auto CheckRWInstForNoSync = [&](Instruction &I) { /// We are looking for volatile instructions or Non-Relaxed atomics. - /// FIXME: We should ipmrove the handling of intrinsics. + /// FIXME: We should improve the handling of intrinsics. if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I)) return true; @@ -1532,6 +1617,115 @@ struct AANoFreeCallSite final : AANoFreeImpl { void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); } }; +/// NoFree attribute for floating values. +struct AANoFreeFloating : AANoFreeImpl { + AANoFreeFloating(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)} + + /// See Abstract Attribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + const IRPosition &IRP = getIRPosition(); + + const auto &NoFreeAA = + A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP)); + if (NoFreeAA.isAssumedNoFree()) + return ChangeStatus::UNCHANGED; + + Value &AssociatedValue = getIRPosition().getAssociatedValue(); + auto Pred = [&](const Use &U, bool &Follow) -> bool { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (auto *CB = dyn_cast<CallBase>(UserI)) { + if (CB->isBundleOperand(&U)) + return false; + if (!CB->isArgOperand(&U)) + return true; + unsigned ArgNo = CB->getArgOperandNo(&U); + + const auto &NoFreeArg = A.getAAFor<AANoFree>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + return NoFreeArg.isAssumedNoFree(); + } + + if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) || + isa<PHINode>(UserI) || isa<SelectInst>(UserI)) { + Follow = true; + return true; + } + + // Unknown user. + return false; + }; + if (!A.checkForAllUses(Pred, *this, AssociatedValue)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +/// NoFree attribute for a call site argument. 
+struct AANoFreeArgument final : AANoFreeFloating { + AANoFreeArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) } +}; + +/// NoFree attribute for call site arguments. +struct AANoFreeCallSiteArgument final : AANoFreeFloating { + AANoFreeCallSiteArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)}; +}; + +/// NoFree attribute for function return value. +struct AANoFreeReturned final : AANoFreeFloating { + AANoFreeReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// NoFree attribute deduction for a call site return value. +struct AANoFreeCallSiteReturned final : AANoFreeFloating { + AANoFreeCallSiteReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + ChangeStatus manifest(Attributor &A) override { + return ChangeStatus::UNCHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) } +}; + /// ------------------------ NonNull Argument Attribute ------------------------ static int64_t getKnownNonNullAndDerefBytesForUse( Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue, @@ -1558,30 +1752,49 @@ static int64_t getKnownNonNullAndDerefBytesForUse( unsigned ArgNo = ICS.getArgumentNo(U); IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); - auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP); + // As long as we only use known information there is no need to track + // dependences here. + auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP, + /* TrackDependence */ false); IsNonNull |= DerefAA.isKnownNonNull(); return DerefAA.getKnownDereferenceableBytes(); } + // We need to follow common pointer manipulation uses to the accesses they + // feed into. We can try to be smart to avoid looking through things we do not + // like for now, e.g., non-inbounds GEPs. 
+ if (isa<CastInst>(I)) { + TrackUse = true; + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + int64_t Offset; if (const Value *Base = getBasePointerOfAccessPointerOperand(I, Offset, DL)) { - if (Base == &AssociatedValue && getPointerOperand(I) == UseV) { + if (Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { int64_t DerefBytes = - Offset + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset; IsNonNull |= !NullPointerIsDefined; - return DerefBytes; + return std::max(int64_t(0), DerefBytes); } } - if (const Value *Base = - GetPointerBaseWithConstantOffset(UseV, Offset, DL, - /*AllowNonInbounds*/ false)) { - auto &DerefAA = - A.getAAFor<AADereferenceable>(QueryingAA, IRPosition::value(*Base)); - IsNonNull |= (!NullPointerIsDefined && DerefAA.isKnownNonNull()); - IsNonNull |= (!NullPointerIsDefined && (Offset != 0)); - int64_t DerefBytes = DerefAA.getKnownDereferenceableBytes(); - return std::max(int64_t(0), DerefBytes - Offset); + + /// Corner case when an offset is 0. + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Offset == 0 && Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } } return 0; @@ -1599,6 +1812,8 @@ struct AANonNullImpl : AANonNull { if (!NullIsDefined && hasAttr({Attribute::NonNull, Attribute::Dereferenceable})) indicateOptimisticFixpoint(); + else if (isa<ConstantPointerNull>(getAssociatedValue())) + indicatePessimisticFixpoint(); else AANonNull::initialize(A); } @@ -1609,7 +1824,7 @@ struct AANonNullImpl : AANonNull { bool TrackUse = false; getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse); - takeKnownMaximum(IsNonNull); + setKnown(IsNonNull); return TrackUse; } @@ -1629,24 +1844,6 @@ struct AANonNullFloating using Base = AAFromMustBeExecutedContext<AANonNull, AANonNullImpl>; AANonNullFloating(const IRPosition &IRP) : Base(IRP) {} - /// See AbstractAttribute::initialize(...). - void initialize(Attributor &A) override { - Base::initialize(A); - - if (isAtFixpoint()) - return; - - const IRPosition &IRP = getIRPosition(); - const Value &V = IRP.getAssociatedValue(); - const DataLayout &DL = A.getDataLayout(); - - // TODO: This context sensitive query should be removed once we can do - // context sensitive queries in the genericValueTraversal below. - if (isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, IRP.getCtxI(), - /* TODO: DT */ nullptr)) - indicateOptimisticFixpoint(); - } - /// See AbstractAttribute::updateImpl(...). 
ChangeStatus updateImpl(Attributor &A) override { ChangeStatus Change = Base::updateImpl(A); @@ -1654,20 +1851,24 @@ struct AANonNullFloating return Change; if (!NullIsDefined) { - const auto &DerefAA = A.getAAFor<AADereferenceable>(*this, getIRPosition()); + const auto &DerefAA = + A.getAAFor<AADereferenceable>(*this, getIRPosition()); if (DerefAA.getAssumedDereferenceableBytes()) return Change; } const DataLayout &DL = A.getDataLayout(); - auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, + DominatorTree *DT = nullptr; + InformationCache &InfoCache = A.getInfoCache(); + if (const Function *Fn = getAnchorScope()) + DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn); + + auto VisitValueCB = [&](Value &V, AANonNull::StateType &T, bool Stripped) -> bool { const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V)); if (!Stripped && this == &AA) { - if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, - /* CtxI */ getCtxI(), - /* TODO: DT */ nullptr)) + if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, getCtxI(), DT)) T.indicatePessimisticFixpoint(); } else { // Use abstract attribute information. @@ -1814,6 +2015,208 @@ struct AANoRecurseCallSite final : AANoRecurseImpl { void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); } }; +/// -------------------- Undefined-Behavior Attributes ------------------------ + +struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { + AAUndefinedBehaviorImpl(const IRPosition &IRP) : AAUndefinedBehavior(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + // through a pointer (i.e. also branches etc.) + ChangeStatus updateImpl(Attributor &A) override { + const size_t UBPrevSize = KnownUBInsts.size(); + const size_t NoUBPrevSize = AssumedNoUBInsts.size(); + + auto InspectMemAccessInstForUB = [&](Instruction &I) { + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // If we reach here, we know we have an instruction + // that accesses memory through a pointer operand, + // for which getPointerOperand() should give it to us. + const Value *PtrOp = + Attributor::getPointerOperand(&I, /* AllowVolatile */ true); + assert(PtrOp && + "Expected pointer operand of memory accessing instruction"); + + // A memory access through a pointer is considered UB + // only if the pointer has constant null value. + // TODO: Expand it to not only check constant values. + if (!isa<ConstantPointerNull>(PtrOp)) { + AssumedNoUBInsts.insert(&I); + return true; + } + const Type *PtrTy = PtrOp->getType(); + + // Because we only consider instructions inside functions, + // assume that a parent function exists. + const Function *F = I.getFunction(); + + // A memory access using constant null pointer is only considered UB + // if null pointer is _not_ defined for the target platform. + if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace())) + AssumedNoUBInsts.insert(&I); + else + KnownUBInsts.insert(&I); + return true; + }; + + auto InspectBrInstForUB = [&](Instruction &I) { + // A conditional branch instruction is considered UB if it has `undef` + // condition. + + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // We know we have a branch instruction. + auto BrInst = cast<BranchInst>(&I); + + // Unconditional branches are never considered UB. 
+ if (BrInst->isUnconditional()) + return true; + + // Either we stopped and the appropriate action was taken, + // or we got back a simplified value to continue. + Optional<Value *> SimplifiedCond = + stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst); + if (!SimplifiedCond.hasValue()) + return true; + AssumedNoUBInsts.insert(&I); + return true; + }; + + A.checkForAllInstructions(InspectMemAccessInstForUB, *this, + {Instruction::Load, Instruction::Store, + Instruction::AtomicCmpXchg, + Instruction::AtomicRMW}); + A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br}); + if (NoUBPrevSize != AssumedNoUBInsts.size() || + UBPrevSize != KnownUBInsts.size()) + return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + bool isKnownToCauseUB(Instruction *I) const override { + return KnownUBInsts.count(I); + } + + bool isAssumedToCauseUB(Instruction *I) const override { + // In simple words, if an instruction is not in the assumed to _not_ + // cause UB, then it is assumed UB (that includes those + // in the KnownUBInsts set). The rest of the boilerplate + // is to ensure that it is one of the instructions we test + // for UB. + + switch (I->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + return !AssumedNoUBInsts.count(I); + case Instruction::Br: { + auto BrInst = cast<BranchInst>(I); + if (BrInst->isUnconditional()) + return false; + return !AssumedNoUBInsts.count(I); + } break; + default: + return false; + } + return false; + } + + ChangeStatus manifest(Attributor &A) override { + if (KnownUBInsts.empty()) + return ChangeStatus::UNCHANGED; + for (Instruction *I : KnownUBInsts) + A.changeToUnreachableAfterManifest(I); + return ChangeStatus::CHANGED; + } + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "undefined-behavior" : "no-ub"; + } + + /// Note: The correctness of this analysis depends on the fact that the + /// following 2 sets will stop changing after some point. + /// "Change" here means that their size changes. + /// The size of each set is monotonically increasing + /// (we only add items to them) and it is upper bounded by the number of + /// instructions in the processed function (we can never save more + /// elements in either set than this number). Hence, at some point, + /// they will stop increasing. + /// Consequently, at some point, both sets will have stopped + /// changing, effectively making the analysis reach a fixpoint. + + /// Note: These 2 sets are disjoint and an instruction can be considered + /// one of 3 things: + /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in + /// the KnownUBInsts set. + /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior + /// has a reason to assume it). + /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior + /// could not find a reason to assume or prove that it can cause UB, + /// hence it assumes it doesn't. We have a set for these instructions + /// so that we don't reprocess them in every update. + /// Note however that instructions in this set may cause UB. + +protected: + /// A set of all live instructions _known_ to cause UB. + SmallPtrSet<Instruction *, 8> KnownUBInsts; + +private: + /// A set of all the (live) instructions that are assumed to _not_ cause UB.
+ SmallPtrSet<Instruction *, 8> AssumedNoUBInsts; + + // Should be called on updates in which if we're processing an instruction + // \p I that depends on a value \p V, one of the following has to happen: + // - If the value is assumed, then stop. + // - If the value is known but undef, then consider it UB. + // - Otherwise, do specific processing with the simplified value. + // We return None in the first 2 cases to signify that an appropriate + // action was taken and the caller should stop. + // Otherwise, we return the simplified value that the caller should + // use for specific processing. + Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V, + Instruction *I) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V)); + Optional<Value *> SimplifiedV = + ValueSimplifyAA.getAssumedSimplifiedValue(A); + if (!ValueSimplifyAA.isKnown()) { + // Don't depend on assumed values. + return llvm::None; + } + if (!SimplifiedV.hasValue()) { + // If it is known (which we tested above) but it doesn't have a value, + // then we can assume `undef` and hence the instruction is UB. + KnownUBInsts.insert(I); + return llvm::None; + } + Value *Val = SimplifiedV.getValue(); + if (isa<UndefValue>(Val)) { + KnownUBInsts.insert(I); + return llvm::None; + } + return Val; + } +}; + +struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl { + AAUndefinedBehaviorFunction(const IRPosition &IRP) + : AAUndefinedBehaviorImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(UndefinedBehaviorInstruction, Instruction, + "Number of instructions known to have UB"); + BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) += + KnownUBInsts.size(); + } +}; + /// ------------------------ Will-Return Attributes ---------------------------- // Helper function that checks whether a function has any cycle. @@ -1914,6 +2317,32 @@ struct AAWillReturnCallSite final : AAWillReturnImpl { void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); } }; +/// -------------------AAReachability Attribute-------------------------- + +struct AAReachabilityImpl : AAReachability { + AAReachabilityImpl(const IRPosition &IRP) : AAReachability(IRP) {} + + const std::string getAsStr() const override { + // TODO: Return the number of reachable queries. + return "reachable"; + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { indicatePessimisticFixpoint(); } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } +}; + +struct AAReachabilityFunction final : public AAReachabilityImpl { + AAReachabilityFunction(const IRPosition &IRP) : AAReachabilityImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); } +}; + /// ------------------------ NoAlias Argument Attribute ------------------------ struct AANoAliasImpl : AANoAlias { @@ -1954,8 +2383,43 @@ struct AANoAliasFloating final : AANoAliasImpl { /// NoAlias attribute for an argument. 
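+/// A callback argument may be observed by a thread other than the direct
+/// caller, so argument noalias must not license reorderings that break
+/// synchronization; e.g., in
+/// \code
+///   void Worker(int *P);          // hypothetical callback
+///   RunOnOtherThread(Worker, P);  // P escapes to another thread
+/// \endcode
+/// the callee may see \p P concurrently with the caller. The updateImpl
+/// below therefore only delegates to the base class when synchronization
+/// cannot be affected (illustrative sketch; the names are made up).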
struct AANoAliasArgument final : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> { - AANoAliasArgument(const IRPosition &IRP) - : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>(IRP) {} + using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>; + AANoAliasArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::update(...). + ChangeStatus updateImpl(Attributor &A) override { + // We have to make sure no-alias on the argument does not break + // synchronization when this is a callback argument, see also [1] below. + // If synchronization cannot be affected, we delegate to the base updateImpl + // function, otherwise we give up for now. + + // If the function is no-sync, no-alias cannot break synchronization. + const auto &NoSyncAA = A.getAAFor<AANoSync>( + *this, IRPosition::function_scope(getIRPosition())); + if (NoSyncAA.isAssumedNoSync()) + return Base::updateImpl(A); + + // If the argument is read-only, no-alias cannot break synchronization. + const auto &MemBehaviorAA = + A.getAAFor<AAMemoryBehavior>(*this, getIRPosition()); + if (MemBehaviorAA.isAssumedReadOnly()) + return Base::updateImpl(A); + + // If the argument is never passed through callbacks, no-alias cannot break + // synchronization. + if (A.checkForAllCallSites( + [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this, + true)) + return Base::updateImpl(A); + + // TODO: add no-alias but make sure it doesn't break synchronization by + // introducing fake uses. See: + // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel, + // International Workshop on OpenMP 2018, + // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf + + return indicatePessimisticFixpoint(); + } /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) } @@ -1987,6 +2451,8 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // (i) Check whether noalias holds in the definition. auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP); + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] check definition: " << V + << " :: " << NoAliasAA << "\n"); if (!NoAliasAA.isAssumedNoAlias()) return indicatePessimisticFixpoint(); @@ -2008,6 +2474,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // (iii) Check there is no other pointer argument which could alias with the // value. + // TODO: AbstractCallSite ImmutableCallSite ICS(&getAnchorValue()); for (unsigned i = 0; i < ICS.getNumArgOperands(); i++) { if (getArgNo() == (int)i) @@ -2018,7 +2485,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { if (const Function *F = getAnchorScope()) { if (AAResults *AAR = A.getInfoCache().getAAResultsForFunction(*F)) { - bool IsAliasing = AAR->isNoAlias(&getAssociatedValue(), ArgOp); + bool IsAliasing = !AAR->isNoAlias(&getAssociatedValue(), ArgOp); LLVM_DEBUG(dbgs() << "[Attributor][NoAliasCSArg] Check alias between " "callsite arguments " @@ -2026,7 +2493,7 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { << getAssociatedValue() << " " << *ArgOp << " => " << (IsAliasing ? 
"" : "no-") << "alias \n"); - if (IsAliasing) + if (!IsAliasing) continue; } } @@ -2108,42 +2575,229 @@ struct AANoAliasCallSiteReturned final : AANoAliasImpl { /// -------------------AAIsDead Function Attribute----------------------- -struct AAIsDeadImpl : public AAIsDead { - AAIsDeadImpl(const IRPosition &IRP) : AAIsDead(IRP) {} +struct AAIsDeadValueImpl : public AAIsDead { + AAIsDeadValueImpl(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AAIsDead::isAssumedDead(). + bool isAssumedDead() const override { return getAssumed(); } + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isAssumedDead(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + return I == getCtxI() && isAssumedDead(); + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return I == getCtxI() && getKnown(); + } + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return isAssumedDead() ? "assumed-dead" : "assumed-live"; + } +}; + +struct AAIsDeadFloating : public AAIsDeadValueImpl { + AAIsDeadFloating(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - const Function *F = getAssociatedFunction(); - if (F && !F->isDeclaration()) - exploreFromEntry(A, F); + if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue())) + if (!wouldInstructionBeTriviallyDead(I)) + indicatePessimisticFixpoint(); + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto UsePred = [&](const Use &U, bool &Follow) { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (CallSite CS = CallSite(UserI)) { + if (!CS.isArgOperand(&U)) + return false; + const IRPosition &CSArgPos = + IRPosition::callsite_argument(CS, CS.getArgumentNo(&U)); + const auto &CSArgIsDead = A.getAAFor<AAIsDead>(*this, CSArgPos); + return CSArgIsDead.isAssumedDead(); + } + if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) { + const IRPosition &RetPos = IRPosition::returned(*RI->getFunction()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, RetPos); + return RetIsDeadAA.isAssumedDead(); + } + Follow = true; + return wouldInstructionBeTriviallyDead(UserI); + }; + + if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + Value &V = getAssociatedValue(); + if (auto *I = dyn_cast<Instruction>(&V)) + if (wouldInstructionBeTriviallyDead(I)) { + A.deleteAfterManifest(*I); + return ChangeStatus::CHANGED; + } + + if (V.use_empty()) + return ChangeStatus::UNCHANGED; + + UndefValue &UV = *UndefValue::get(V.getType()); + bool AnyChange = A.changeValueAfterManifest(V, UV); + return AnyChange ? 
ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } - void exploreFromEntry(Attributor &A, const Function *F) { - ToBeExploredPaths.insert(&(F->getEntryBlock().front())); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(IsDead) + } +}; - for (size_t i = 0; i < ToBeExploredPaths.size(); ++i) - if (const Instruction *NextNoReturnI = - findNextNoReturn(A, ToBeExploredPaths[i])) - NoReturnCalls.insert(NextNoReturnI); +struct AAIsDeadArgument : public AAIsDeadFloating { + AAIsDeadArgument(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} - // Mark the block live after we looked for no-return instructions. - assumeLive(A, F->getEntryBlock()); + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (!getAssociatedFunction()->hasExactDefinition()) + indicatePessimisticFixpoint(); } - /// Find the next assumed noreturn instruction in the block of \p I starting - /// from, thus including, \p I. - /// - /// The caller is responsible to monitor the ToBeExploredPaths set as new - /// instructions discovered in other basic block will be placed in there. - /// - /// \returns The next assumed noreturn instructions in the block of \p I - /// starting from, thus including, \p I. - const Instruction *findNextNoReturn(Attributor &A, const Instruction *I); + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = AAIsDeadFloating::manifest(A); + Argument &Arg = *getAssociatedArgument(); + if (Arg.getParent()->hasLocalLinkage()) + if (A.registerFunctionSignatureRewrite( + Arg, /* ReplacementTypes */ {}, + Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{}, + Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) + return ChangeStatus::CHANGED; + return Changed; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { + AAIsDeadCallSiteArgument(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::manifest(...). 
+ ChangeStatus manifest(Attributor &A) override { + CallBase &CB = cast<CallBase>(getAnchorValue()); + Use &U = CB.getArgOperandUse(getArgNo()); + assert(!isa<UndefValue>(U.get()) && + "Expected undef values to be filtered out!"); + UndefValue &UV = *UndefValue::get(U->getType()); + if (A.changeUseAfterManifest(U, UV)) + return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) } +}; + +struct AAIsDeadReturned : public AAIsDeadValueImpl { + AAIsDeadReturned(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + + auto PredForCallSite = [&](AbstractCallSite ACS) { + if (ACS.isCallbackCall()) + return false; + const IRPosition &CSRetPos = + IRPosition::callsite_returned(ACS.getCallSite()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, CSRetPos); + return RetIsDeadAA.isAssumedDead(); + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // TODO: Rewrite the signature to return void? + bool AnyChange = false; + UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType()); + auto RetInstPred = [&](Instruction &I) { + ReturnInst &RI = cast<ReturnInst>(I); + if (!isa<UndefValue>(RI.getReturnValue())) + AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV); + return true; + }; + A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret}); + return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { + AAIsDeadCallSiteReturned(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(IsDead) } +}; + +struct AAIsDeadFunction : public AAIsDead { + AAIsDeadFunction(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + const Function *F = getAssociatedFunction(); + if (F && !F->isDeclaration()) { + ToBeExploredFrom.insert(&F->getEntryBlock().front()); + assumeLive(A, F->getEntryBlock()); + } + } /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" + - std::to_string(getAssociatedFunction()->size()) + "][#NRI " + - std::to_string(NoReturnCalls.size()) + "]"; + std::to_string(getAssociatedFunction()->size()) + "][#TBEP " + + std::to_string(ToBeExploredFrom.size()) + "][#KDE " + + std::to_string(KnownDeadEnds.size()) + "]"; } /// See AbstractAttribute::manifest(...). @@ -2164,73 +2818,22 @@ struct AAIsDeadImpl : public AAIsDead { // function allows to catch asynchronous exceptions. 
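+    // E.g., with MSVC-style asynchronous EH even a plain memory access can
+    // be observed to unwind:
+    // \code
+    //   __try { *P = 0; } __except (1) { /* reached on a HW fault */ }
+    // \endcode
+    // so the invoke-to-call rewrite is only safe when no such personality
+    // is in place (illustrative, MSVC-specific snippet).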
bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); - for (const Instruction *NRC : NoReturnCalls) { - Instruction *I = const_cast<Instruction *>(NRC); - BasicBlock *BB = I->getParent(); - Instruction *SplitPos = I->getNextNode(); - // TODO: mark stuff before unreachable instructions as dead. - - if (auto *II = dyn_cast<InvokeInst>(I)) { - // If we keep the invoke the split position is at the beginning of the - // normal desitination block (it invokes a noreturn function after all). - BasicBlock *NormalDestBB = II->getNormalDest(); - SplitPos = &NormalDestBB->front(); - - /// Invoke is replaced with a call and unreachable is placed after it if - /// the callee is nounwind and noreturn. Otherwise, we keep the invoke - /// and only place an unreachable in the normal successor. - if (Invoke2CallAllowed) { - if (II->getCalledFunction()) { - const IRPosition &IPos = IRPosition::callsite_function(*II); - const auto &AANoUnw = A.getAAFor<AANoUnwind>(*this, IPos); - if (AANoUnw.isAssumedNoUnwind()) { - LLVM_DEBUG(dbgs() - << "[AAIsDead] Replace invoke with call inst\n"); - // We do not need an invoke (II) but instead want a call followed - // by an unreachable. However, we do not remove II as other - // abstract attributes might have it cached as part of their - // results. Given that we modify the CFG anyway, we simply keep II - // around but in a new dead block. To avoid II being live through - // a different edge we have to ensure the block we place it in is - // only reached from the current block of II and then not reached - // at all when we insert the unreachable. - SplitBlockPredecessors(NormalDestBB, {BB}, ".i2c"); - CallInst *CI = createCallMatchingInvoke(II); - CI->insertBefore(II); - CI->takeName(II); - II->replaceAllUsesWith(CI); - SplitPos = CI->getNextNode(); - } - } - } - - if (SplitPos == &NormalDestBB->front()) { - // If this is an invoke of a noreturn function the edge to the normal - // destination block is dead but not necessarily the block itself. - // TODO: We need to move to an edge based system during deduction and - // also manifest. - assert(!NormalDestBB->isLandingPad() && - "Expected the normal destination not to be a landingpad!"); - if (NormalDestBB->getUniquePredecessor() == BB) { - assumeLive(A, *NormalDestBB); - } else { - BasicBlock *SplitBB = - SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); - // The split block is live even if it contains only an unreachable - // instruction at the end. - assumeLive(A, *SplitBB); - SplitPos = SplitBB->getTerminator(); - HasChanged = ChangeStatus::CHANGED; - } - } - } - - if (isa_and_nonnull<UnreachableInst>(SplitPos)) + KnownDeadEnds.set_union(ToBeExploredFrom); + for (const Instruction *DeadEndI : KnownDeadEnds) { + auto *CB = dyn_cast<CallBase>(DeadEndI); + if (!CB) + continue; + const auto &NoReturnAA = + A.getAAFor<AANoReturn>(*this, IRPosition::callsite_function(*CB)); + bool MayReturn = !NoReturnAA.isAssumedNoReturn(); + if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB))) continue; - BB = SplitPos->getParent(); - SplitBlock(BB, SplitPos); - changeToUnreachable(BB->getTerminator(), /* UseLLVMTrap */ false); + if (auto *II = dyn_cast<InvokeInst>(DeadEndI)) + A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II)); + else + A.changeToUnreachableAfterManifest( + const_cast<Instruction *>(DeadEndI->getNextNode())); HasChanged = ChangeStatus::CHANGED; } @@ -2244,6 +2847,12 @@ struct AAIsDeadImpl : public AAIsDead { /// See AbstractAttribute::updateImpl(...). 
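+  /// E.g., in a function body like
+  /// \code
+  ///   abort();    // assumed noreturn: becomes a known dead end
+  ///   cleanup();  // never reached; manifest() above plants an unreachable
+  /// \endcode
+  /// the update only keeps the first instruction live (illustrative C++
+  /// source view; `cleanup` is a hypothetical call).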
ChangeStatus updateImpl(Attributor &A) override; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// Returns true if the function is assumed dead. + bool isAssumedDead() const override { return false; } + /// See AAIsDead::isAssumedDead(BasicBlock *). bool isAssumedDead(const BasicBlock *BB) const override { assert(BB->getParent() == getAssociatedFunction() && @@ -2272,8 +2881,14 @@ struct AAIsDeadImpl : public AAIsDead { if (!AssumedLiveBlocks.count(I->getParent())) return true; - // If it is not after a noreturn call, than it is live. - return isAfterNoReturn(I); + // If it is not after a liveness barrier it is live. + const Instruction *PrevI = I->getPrevNode(); + while (PrevI) { + if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI)) + return true; + PrevI = PrevI->getPrevNode(); + } + return false; } /// See AAIsDead::isKnownDead(Instruction *I). @@ -2281,9 +2896,6 @@ struct AAIsDeadImpl : public AAIsDead { return getKnown() && isAssumedDead(I); } - /// Check if instruction is after noreturn call, in other words, assumed dead. - bool isAfterNoReturn(const Instruction *I) const; - /// Determine if \p F might catch asynchronous exceptions. static bool mayCatchAsynchronousExceptions(const Function &F) { return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F); @@ -2291,9 +2903,9 @@ struct AAIsDeadImpl : public AAIsDead { /// Assume \p BB is (partially) live now and indicate to the Attributor \p A /// that internal function called from \p BB should now be looked at. - void assumeLive(Attributor &A, const BasicBlock &BB) { + bool assumeLive(Attributor &A, const BasicBlock &BB) { if (!AssumedLiveBlocks.insert(&BB).second) - return; + return false; // We assume that all of BB is (probably) live now and if there are calls to // internal functions we will assume that those are now live as well. This @@ -2304,140 +2916,219 @@ struct AAIsDeadImpl : public AAIsDead { if (const Function *F = ICS.getCalledFunction()) if (F->hasLocalLinkage()) A.markLiveInternalFunction(*F); + return true; } - /// Collection of to be explored paths. - SmallSetVector<const Instruction *, 8> ToBeExploredPaths; + /// Collection of instructions that need to be explored again, e.g., we + /// did assume they do not transfer control to (one of their) successors. + SmallSetVector<const Instruction *, 8> ToBeExploredFrom; + + /// Collection of instructions that are known to not transfer control. + SmallSetVector<const Instruction *, 8> KnownDeadEnds; /// Collection of all assumed live BasicBlocks. DenseSet<const BasicBlock *> AssumedLiveBlocks; - - /// Collection of calls with noreturn attribute, assumed or knwon. 
- SmallSetVector<const Instruction *, 4> NoReturnCalls; }; -struct AAIsDeadFunction final : public AAIsDeadImpl { - AAIsDeadFunction(const IRPosition &IRP) : AAIsDeadImpl(IRP) {} +static bool +identifyAliveSuccessors(Attributor &A, const CallBase &CB, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + const IRPosition &IPos = IRPosition::callsite_function(CB); + + const auto &NoReturnAA = A.getAAFor<AANoReturn>(AA, IPos); + if (NoReturnAA.isAssumedNoReturn()) + return !NoReturnAA.isKnownNoReturn(); + if (CB.isTerminator()) + AliveSuccessors.push_back(&CB.getSuccessor(0)->front()); + else + AliveSuccessors.push_back(CB.getNextNode()); + return false; +} - /// See AbstractAttribute::trackStatistics() - void trackStatistics() const override { - STATS_DECL(PartiallyDeadBlocks, Function, - "Number of basic blocks classified as partially dead"); - BUILD_STAT_NAME(PartiallyDeadBlocks, Function) += NoReturnCalls.size(); +static bool +identifyAliveSuccessors(Attributor &A, const InvokeInst &II, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = + identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors); + + // First, determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. + if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } else { + const IRPosition &IPos = IRPosition::callsite_function(II); + const auto &AANoUnw = A.getAAFor<AANoUnwind>(AA, IPos); + if (AANoUnw.isAssumedNoUnwind()) { + UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind(); + } else { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } } -}; + return UsedAssumedInformation; +} -bool AAIsDeadImpl::isAfterNoReturn(const Instruction *I) const { - const Instruction *PrevI = I->getPrevNode(); - while (PrevI) { - if (NoReturnCalls.count(PrevI)) - return true; - PrevI = PrevI->getPrevNode(); +static Optional<ConstantInt *> +getAssumedConstant(Attributor &A, const Value &V, AbstractAttribute &AA, + bool &UsedAssumedInformation) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(AA, IRPosition::value(V)); + Optional<Value *> SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(A); + UsedAssumedInformation |= !ValueSimplifyAA.isKnown(); + if (!SimplifiedV.hasValue()) + return llvm::None; + if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) + return llvm::None; + return dyn_cast_or_null<ConstantInt>(SimplifiedV.getValue()); +} + +static bool +identifyAliveSuccessors(Attributor &A, const BranchInst &BI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + if (BI.getNumSuccessors() == 1) { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + } else { + Optional<ConstantInt *> CI = + getAssumedConstant(A, *BI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume both edges are dead. 
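+      // Waiting is sound: a later update either materializes a ConstantInt
+      // (one successor lives, see below) or gives up and revives both; e.g.
+      // \code
+      //   if (X == X) A(); else B();  // folds to true: only A()'s block
+      // \endcode
+      // (illustrative; `A`/`B` are placeholder calls).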
+ } else if (CI.getValue()) { + const BasicBlock *SuccBB = + BI.getSuccessor(1 - CI.getValue()->getZExtValue()); + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + AliveSuccessors.push_back(&BI.getSuccessor(1)->front()); + UsedAssumedInformation = false; + } } - return false; + return UsedAssumedInformation; } -const Instruction *AAIsDeadImpl::findNextNoReturn(Attributor &A, - const Instruction *I) { - const BasicBlock *BB = I->getParent(); - const Function &F = *BB->getParent(); - - // Flag to determine if we can change an invoke to a call assuming the callee - // is nounwind. This is not possible if the personality of the function allows - // to catch asynchronous exceptions. - bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); - - // TODO: We should have a function that determines if an "edge" is dead. - // Edges could be from an instruction to the next or from a terminator - // to the successor. For now, we need to special case the unwind block - // of InvokeInst below. - - while (I) { - ImmutableCallSite ICS(I); - - if (ICS) { - const IRPosition &IPos = IRPosition::callsite_function(ICS); - // Regarless of the no-return property of an invoke instruction we only - // learn that the regular successor is not reachable through this - // instruction but the unwind block might still be. - if (auto *Invoke = dyn_cast<InvokeInst>(I)) { - // Use nounwind to justify the unwind block is dead as well. - const auto &AANoUnw = A.getAAFor<AANoUnwind>(*this, IPos); - if (!Invoke2CallAllowed || !AANoUnw.isAssumedNoUnwind()) { - assumeLive(A, *Invoke->getUnwindDest()); - ToBeExploredPaths.insert(&Invoke->getUnwindDest()->front()); - } +static bool +identifyAliveSuccessors(Attributor &A, const SwitchInst &SI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + Optional<ConstantInt *> CI = + getAssumedConstant(A, *SI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume all edges are dead. + } else if (CI.getValue()) { + for (auto &CaseIt : SI.cases()) { + if (CaseIt.getCaseValue() == CI.getValue()) { + AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front()); + return UsedAssumedInformation; } - - const auto &NoReturnAA = A.getAAFor<AANoReturn>(*this, IPos); - if (NoReturnAA.isAssumedNoReturn()) - return I; } - - I = I->getNextNode(); + AliveSuccessors.push_back(&SI.getDefaultDest()->front()); + return UsedAssumedInformation; + } else { + for (const BasicBlock *SuccBB : successors(SI.getParent())) + AliveSuccessors.push_back(&SuccBB->front()); } + return UsedAssumedInformation; +} - // get new paths (reachable blocks). - for (const BasicBlock *SuccBB : successors(BB)) { - assumeLive(A, *SuccBB); - ToBeExploredPaths.insert(&SuccBB->front()); - } +ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { + ChangeStatus Change = ChangeStatus::UNCHANGED; - // No noreturn instruction found. - return nullptr; -} + LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/" + << getAssociatedFunction()->size() << "] BBs and " + << ToBeExploredFrom.size() << " exploration points and " + << KnownDeadEnds.size() << " known dead ends\n"); -ChangeStatus AAIsDeadImpl::updateImpl(Attributor &A) { - ChangeStatus Status = ChangeStatus::UNCHANGED; + // Copy and clear the list of instructions we need to explore from. It is + // refilled with instructions the next update has to look at. 
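+  // I.e., the loop below is a standard forward exploration:
+  // \code
+  //   while (!Worklist.empty())
+  //     for (const Instruction *S : aliveSuccessorsOf(Worklist.pop_back_val()))
+  //       if (assumeLive(A, *S->getParent()))
+  //         Worklist.push_back(S);
+  // \endcode
+  // (condensed sketch; `aliveSuccessorsOf` stands in for the opcode switch
+  // below, and instructions whose verdict rests on assumed facts are kept
+  // in NewToBeExploredFrom for the next update).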
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(), + ToBeExploredFrom.end()); + decltype(ToBeExploredFrom) NewToBeExploredFrom; - // Temporary collection to iterate over existing noreturn instructions. This - // will alow easier modification of NoReturnCalls collection - SmallVector<const Instruction *, 8> NoReturnChanged; + SmallVector<const Instruction *, 8> AliveSuccessors; + while (!Worklist.empty()) { + const Instruction *I = Worklist.pop_back_val(); + LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n"); - for (const Instruction *I : NoReturnCalls) - NoReturnChanged.push_back(I); + AliveSuccessors.clear(); - for (const Instruction *I : NoReturnChanged) { - size_t Size = ToBeExploredPaths.size(); + bool UsedAssumedInformation = false; + switch (I->getOpcode()) { + // TODO: look for (assumed) UB to backwards propagate "deadness". + default: + if (I->isTerminator()) { + for (const BasicBlock *SuccBB : successors(I->getParent())) + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(I->getNextNode()); + } + break; + case Instruction::Call: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Invoke: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Br: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Switch: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I), + *this, AliveSuccessors); + break; + } - const Instruction *NextNoReturnI = findNextNoReturn(A, I); - if (NextNoReturnI != I) { - Status = ChangeStatus::CHANGED; - NoReturnCalls.remove(I); - if (NextNoReturnI) - NoReturnCalls.insert(NextNoReturnI); + if (UsedAssumedInformation) { + NewToBeExploredFrom.insert(I); + } else { + Change = ChangeStatus::CHANGED; + if (AliveSuccessors.empty() || + (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors())) + KnownDeadEnds.insert(I); } - // Explore new paths. - while (Size != ToBeExploredPaths.size()) { - Status = ChangeStatus::CHANGED; - if (const Instruction *NextNoReturnI = - findNextNoReturn(A, ToBeExploredPaths[Size++])) - NoReturnCalls.insert(NextNoReturnI); + LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: " + << AliveSuccessors.size() << " UsedAssumedInformation: " + << UsedAssumedInformation << "\n"); + + for (const Instruction *AliveSuccessor : AliveSuccessors) { + if (!I->isTerminator()) { + assert(AliveSuccessors.size() == 1 && + "Non-terminator expected to have a single successor!"); + Worklist.push_back(AliveSuccessor); + } else { + if (assumeLive(A, *AliveSuccessor->getParent())) + Worklist.push_back(AliveSuccessor); + } } } - LLVM_DEBUG(dbgs() << "[AAIsDead] AssumedLiveBlocks: " - << AssumedLiveBlocks.size() << " Total number of blocks: " - << getAssociatedFunction()->size() << "\n"); + ToBeExploredFrom = std::move(NewToBeExploredFrom); // If we know everything is live there is no need to query for liveness. - if (NoReturnCalls.empty() && - getAssociatedFunction()->size() == AssumedLiveBlocks.size()) { - // Indicating a pessimistic fixpoint will cause the state to be "invalid" - // which will cause the Attributor to not return the AAIsDead on request, - // which will prevent us from querying isAssumedDead(). 
-    indicatePessimisticFixpoint();
-    assert(!isValidState() && "Expected an invalid state!");
-    Status = ChangeStatus::CHANGED;
-  }
-
-  return Status;
+  // Instead, indicating a pessimistic fixpoint will cause the state to be
+  // "invalid" and all queries to be answered conservatively without lookups.
+  // To be in this state we have to (1) have finished the exploration,
+  // (2) not have ruled unreachable code dead, and (3) not have discovered
+  // any non-trivial dead end.
+  if (ToBeExploredFrom.empty() &&
+      getAssociatedFunction()->size() == AssumedLiveBlocks.size() &&
+      llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
+        return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
+      }))
+    return indicatePessimisticFixpoint();
+  return Change;
 }
 
 /// Liveness information for call sites.
-struct AAIsDeadCallSite final : AAIsDeadImpl {
-  AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadImpl(IRP) {}
+struct AAIsDeadCallSite final : AAIsDeadFunction {
+  AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadFunction(IRP) {}
 
   /// See AbstractAttribute::initialize(...).
   void initialize(Attributor &A) override {
@@ -2463,10 +3154,9 @@ struct AAIsDeadCallSite final : AAIsDeadImpl {
 template <>
 ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
                                                      const DerefState &R) {
-  ChangeStatus CS0 = clampStateAndIndicateChange<IntegerState>(
-      S.DerefBytesState, R.DerefBytesState);
-  ChangeStatus CS1 =
-      clampStateAndIndicateChange<IntegerState>(S.GlobalState, R.GlobalState);
+  ChangeStatus CS0 =
+      clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
+  ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
   return CS0 | CS1;
 }
 
@@ -2496,16 +3186,49 @@ struct AADereferenceableImpl : AADereferenceable {
   const StateType &getState() const override { return *this; }
   /// }
 
+  /// Helper function for collecting accessed bytes in the
+  /// must-be-executed-context.
+  void addAccessedBytesForUse(Attributor &A, const Use *U,
+                              const Instruction *I) {
+    const Value *UseV = U->get();
+    if (!UseV->getType()->isPointerTy())
+      return;
+
+    Type *PtrTy = UseV->getType();
+    const DataLayout &DL = A.getDataLayout();
+    int64_t Offset;
+    if (const Value *Base = getBasePointerOfAccessPointerOperand(
+            I, Offset, DL, /*AllowNonInbounds*/ true)) {
+      if (Base == &getAssociatedValue() &&
+          Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+        uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
+        addAccessedBytes(Offset, Size);
+      }
+    }
+  }
+
   /// See AAFromMustBeExecutedContext
   bool followUse(Attributor &A, const Use *U, const Instruction *I) {
     bool IsNonNull = false;
     bool TrackUse = false;
     int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
         A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
+
+    addAccessedBytesForUse(A, U, I);
     takeKnownDerefBytesMaximum(DerefBytes);
     return TrackUse;
   }
 
+  /// See AbstractAttribute::manifest(...).
+  ChangeStatus manifest(Attributor &A) override {
+    ChangeStatus Change = AADereferenceable::manifest(A);
+    if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
+      removeAttrs({Attribute::DereferenceableOrNull});
+      return ChangeStatus::CHANGED;
+    }
+    return Change;
+  }
+
   void getDeducedAttributes(LLVMContext &Ctx,
                             SmallVectorImpl<Attribute> &Attrs) const override {
     // TODO: Add *_globally support
@@ -2564,6 +3287,8 @@ struct AADereferenceableFloating
       T.GlobalState &= DS.GlobalState;
     }
 
+    // TODO: Use `AAConstantRange` to infer dereferenceable bytes.
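+    // E.g., an access through an inbounds GEP with a positive index widens
+    // the known range:
+    // \code
+    //   int LoadSecond(int *P) { return P[1]; }  // P: dereferenceable(8)
+    //   int LoadPrev(int *P) { return P[-1]; }   // must not widen P's range
+    // \endcode
+    // (illustrative C++ functions, assuming 4-byte ints).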
+ // For now we do not try to "increase" dereferenceability due to negative // indices as we first have to come up with code to deal with loops and // for overflows of the dereferenceable bytes. @@ -2654,30 +3379,6 @@ struct AADereferenceableCallSiteReturned final AADereferenceable, AADereferenceableImpl>; AADereferenceableCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} - /// See AbstractAttribute::initialize(...). - void initialize(Attributor &A) override { - Base::initialize(A); - Function *F = getAssociatedFunction(); - if (!F) - indicatePessimisticFixpoint(); - } - - /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override { - // TODO: Once we have call site specific value information we can provide - // call site specific liveness information and then it makes - // sense to specialize attributes for call sites arguments instead of - // redirecting requests to the callee argument. - - ChangeStatus Change = Base::updateImpl(A); - Function *F = getAssociatedFunction(); - const IRPosition &FnPos = IRPosition::returned(*F); - auto &FnAA = A.getAAFor<AADereferenceable>(*this, FnPos); - return Change | - clampStateAndIndicateChange( - getState(), static_cast<const DerefState &>(FnAA.getState())); - } - /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(dereferenceable); @@ -2686,16 +3387,69 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ +static unsigned int getKnownAlignForUse(Attributor &A, + AbstractAttribute &QueryingAA, + Value &AssociatedValue, const Use *U, + const Instruction *I, bool &TrackUse) { + // We need to follow common pointer manipulation uses to the accesses they + // feed into. + if (isa<CastInst>(I)) { + // Follow all but ptr2int casts. + TrackUse = !isa<PtrToIntInst>(I); + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + } + + unsigned Alignment = 0; + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U) || ICS.isCallee(U)) + return 0; + + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + // As long as we only use known information there is no need to track + // dependences here. + auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP, + /* TrackDependence */ false); + Alignment = AlignAA.getKnownAlign(); + } + + const Value *UseV = U->get(); + if (auto *SI = dyn_cast<StoreInst>(I)) + Alignment = SI->getAlignment(); + else if (auto *LI = dyn_cast<LoadInst>(I)) + Alignment = LI->getAlignment(); + + if (Alignment <= 1) + return 0; + + auto &DL = A.getDataLayout(); + int64_t Offset; + + if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) { + if (Base == &AssociatedValue) { + // BasePointerAddr + Offset = Alignment * Q for some integer Q. + // So we can say that the maximum power of two which is a divisor of + // gcd(Offset, Alignment) is an alignment. + + uint32_t gcd = + greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment); + Alignment = llvm::PowerOf2Floor(gcd); + } + } + + return Alignment; +} struct AAAlignImpl : AAAlign { AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {} - // Max alignemnt value allowed in IR - static const unsigned MAX_ALIGN = 1U << 29; - /// See AbstractAttribute::initialize(...). 
void initialize(Attributor &A) override { - takeAssumedMinimum(MAX_ALIGN); - SmallVector<Attribute, 4> Attrs; getAttrs({Attribute::Alignment}, Attrs); for (const Attribute &Attr : Attrs) @@ -2718,7 +3472,7 @@ struct AAAlignImpl : AAAlign { if (SI->getPointerOperand() == &AnchorVal) if (SI->getAlignment() < getAssumedAlign()) { STATS_DECLTRACK(AAAlign, Store, - "Number of times alignemnt added to a store"); + "Number of times alignment added to a store"); SI->setAlignment(Align(getAssumedAlign())); Changed = ChangeStatus::CHANGED; } @@ -2727,7 +3481,7 @@ struct AAAlignImpl : AAAlign { if (LI->getAlignment() < getAssumedAlign()) { LI->setAlignment(Align(getAssumedAlign())); STATS_DECLTRACK(AAAlign, Load, - "Number of times alignemnt added to a load"); + "Number of times alignment added to a load"); Changed = ChangeStatus::CHANGED; } } @@ -2748,6 +3502,16 @@ struct AAAlignImpl : AAAlign { Attrs.emplace_back( Attribute::getWithAlignment(Ctx, Align(getAssumedAlign()))); } + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool TrackUse = false; + + unsigned int KnownAlign = + getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse); + takeKnownMaximum(KnownAlign); + + return TrackUse; + } /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { @@ -2758,11 +3522,14 @@ struct AAAlignImpl : AAAlign { }; /// Align attribute for a floating value. -struct AAAlignFloating : AAAlignImpl { - AAAlignFloating(const IRPosition &IRP) : AAAlignImpl(IRP) {} +struct AAAlignFloating : AAFromMustBeExecutedContext<AAAlign, AAAlignImpl> { + using Base = AAFromMustBeExecutedContext<AAAlign, AAAlignImpl>; + AAAlignFloating(const IRPosition &IRP) : Base(IRP) {} /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { + Base::updateImpl(A); + const DataLayout &DL = A.getDataLayout(); auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, @@ -2808,9 +3575,12 @@ struct AAAlignReturned final /// Align attribute for function argument. struct AAAlignArgument final - : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> { + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { AAAlignArgument(const IRPosition &IRP) - : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>(IRP) {} + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl>( + IRP) {} /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) } @@ -2824,35 +3594,39 @@ struct AAAlignCallSiteArgument final : AAAlignFloating { return AAAlignImpl::manifest(A); } + /// See AbstractAttribute::updateImpl(Attributor &A). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Changed = AAAlignFloating::updateImpl(A); + if (Argument *Arg = getAssociatedArgument()) { + const auto &ArgAlignAA = A.getAAFor<AAAlign>( + *this, IRPosition::argument(*Arg), /* TrackDependence */ false, + DepClassTy::OPTIONAL); + takeKnownMaximum(ArgAlignAA.getKnownAlign()); + } + return Changed; + } + /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) } }; /// Align attribute deduction for a call site return value. 
-struct AAAlignCallSiteReturned final : AAAlignImpl { - AAAlignCallSiteReturned(const IRPosition &IRP) : AAAlignImpl(IRP) {} +struct AAAlignCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { + using Base = + AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl>; + AAAlignCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - AAAlignImpl::initialize(A); + Base::initialize(A); Function *F = getAssociatedFunction(); if (!F) indicatePessimisticFixpoint(); } - /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override { - // TODO: Once we have call site specific value information we can provide - // call site specific liveness information and then it makes - // sense to specialize attributes for call sites arguments instead of - // redirecting requests to the callee argument. - Function *F = getAssociatedFunction(); - const IRPosition &FnPos = IRPosition::returned(*F); - auto &FnAA = A.getAAFor<AAAlign>(*this, FnPos); - return clampStateAndIndicateChange( - getState(), static_cast<const AAAlign::StateType &>(FnAA.getState())); - } - /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; @@ -2865,7 +3639,7 @@ struct AANoReturnImpl : public AANoReturn { void initialize(Attributor &A) override { AANoReturn::initialize(A); Function *F = getAssociatedFunction(); - if (!F || F->hasFnAttribute(Attribute::WillReturn)) + if (!F) indicatePessimisticFixpoint(); } @@ -2876,9 +3650,6 @@ struct AANoReturnImpl : public AANoReturn { /// See AbstractAttribute::updateImpl(Attributor &A). virtual ChangeStatus updateImpl(Attributor &A) override { - const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, getIRPosition()); - if (WillReturnAA.isKnownWillReturn()) - return indicatePessimisticFixpoint(); auto CheckForNoReturn = [](Instruction &) { return false; }; if (!A.checkForAllInstructions(CheckForNoReturn, *this, {(unsigned)Instruction::Ret})) @@ -2924,7 +3695,16 @@ struct AANoCaptureImpl : public AANoCapture { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - AANoCapture::initialize(A); + if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) { + indicateOptimisticFixpoint(); + return; + } + Function *AnchorScope = getAnchorScope(); + if (isFnInterfaceKind() && + (!AnchorScope || !AnchorScope->hasExactDefinition())) { + indicatePessimisticFixpoint(); + return; + } // You cannot "capture" null in the default address space. if (isa<ConstantPointerNull>(getAssociatedValue()) && @@ -2933,13 +3713,11 @@ struct AANoCaptureImpl : public AANoCapture { return; } - const IRPosition &IRP = getIRPosition(); - const Function *F = - getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope; // Check what state the associated function can actually capture. if (F) - determineFunctionCaptureCapabilities(IRP, *F, *this); + determineFunctionCaptureCapabilities(getIRPosition(), *F, *this); else indicatePessimisticFixpoint(); } @@ -2967,7 +3745,7 @@ struct AANoCaptureImpl : public AANoCapture { /// state in memory and through "returning/throwing", respectively. 
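+  /// E.g., for a function such as
+  /// \code
+  ///   void F(int *P) noexcept;  // assumed readnone, returns void
+  /// \endcode
+  /// the argument can be captured neither in memory nor through the return
+  /// or unwind path, leaving only the remaining no-capture bits to deduce
+  /// (illustrative example under the stated attribute assumptions).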
   static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
                                                    const Function &F,
-                                                   IntegerState &State) {
+                                                   BitIntegerState &State) {
     // TODO: Once we have memory behavior attributes we should use them here.
 
     // If we know we cannot communicate or write to memory, we do not care about
@@ -2992,7 +3770,7 @@ struct AANoCaptureImpl : public AANoCapture {
     // Check existing "returned" attributes.
     int ArgNo = IRP.getArgNo();
     if (F.doesNotThrow() && ArgNo >= 0) {
-      for (unsigned u = 0, e = F.arg_size(); u< e; ++u)
+      for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
         if (F.hasParamAttribute(u, Attribute::Returned)) {
           if (u == unsigned(ArgNo))
             State.removeAssumedBits(NOT_CAPTURED_IN_RET);
@@ -3036,7 +3814,7 @@ struct AACaptureUseTracker final : public CaptureTracker {
   /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
   /// conservatively set to true.
   AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
-                      const AAIsDead &IsDeadAA, IntegerState &State,
+                      const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
                       SmallVectorImpl<const Value *> &PotentialCopies,
                       unsigned &RemainingUsesToExplore)
       : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
@@ -3155,7 +3933,7 @@ private:
   const AAIsDead &IsDeadAA;
 
   /// The state currently updated.
-  IntegerState &State;
+  AANoCapture::StateType &State;
 
   /// Set of potential copies of the tracked value.
   SmallVectorImpl<const Value *> &PotentialCopies;
@@ -3238,9 +4016,11 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
   while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
     Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
 
-  AAAlign::StateType &S = getState();
+  AANoCapture::StateType &S = getState();
   auto Assumed = S.getAssumed();
   S.intersectAssumedBits(T.getAssumed());
+  if (!isAssumedNoCaptureMaybeReturned())
+    return indicatePessimisticFixpoint();
   return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
                                    : ChangeStatus::CHANGED;
 }
@@ -3257,6 +4037,14 @@ struct AANoCaptureArgument final : AANoCaptureImpl {
 struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
   AANoCaptureCallSiteArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {}
 
+  /// See AbstractAttribute::initialize(...).
+  void initialize(Attributor &A) override {
+    if (Argument *Arg = getAssociatedArgument())
+      if (Arg->hasByValAttr())
+        indicateOptimisticFixpoint();
+    AANoCaptureImpl::initialize(A);
+  }
+
   /// See AbstractAttribute::updateImpl(...).
   ChangeStatus updateImpl(Attributor &A) override {
     // TODO: Once we have call site specific value information we can provide
@@ -3375,6 +4163,28 @@ struct AAValueSimplifyImpl : AAValueSimplify {
     return true;
   }
 
+  bool askSimplifiedValueForAAValueConstantRange(Attributor &A) {
+    if (!getAssociatedValue().getType()->isIntegerTy())
+      return false;
+
+    const auto &ValueConstantRangeAA =
+        A.getAAFor<AAValueConstantRange>(*this, getIRPosition());
+
+    Optional<ConstantInt *> COpt =
+        ValueConstantRangeAA.getAssumedConstantInt(A);
+    if (COpt.hasValue()) {
+      if (auto *C = COpt.getValue())
+        SimplifiedAssociatedValue = C;
+      else
+        return false;
+    } else {
+      // FIXME: It should be llvm::None, but if we set llvm::None,
+      // values are mistakenly inferred as `undef` now.
+      SimplifiedAssociatedValue = &getAssociatedValue();
+    }
+    return true;
+  }
+
+  /// See AbstractAttribute::manifest(...).
   ChangeStatus manifest(Attributor &A) override {
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
@@ -3389,7 +4199,7 @@ struct AAValueSimplifyImpl : AAValueSimplify {
       if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
         LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << V << " -> " << *C
                           << "\n");
-        V.replaceAllUsesWith(C);
+        A.changeValueAfterManifest(V, *C);
         Changed = ChangeStatus::CHANGED;
       }
     }
@@ -3397,6 +4207,15 @@ struct AAValueSimplifyImpl : AAValueSimplify {
     return Changed | AAValueSimplify::manifest(A);
   }
 
+  /// See AbstractState::indicatePessimisticFixpoint(...).
+  ChangeStatus indicatePessimisticFixpoint() override {
+    // NOTE: Associated value will be returned in a pessimistic fixpoint and is
+    // regarded as known. That's why `indicateOptimisticFixpoint` is called.
+    SimplifiedAssociatedValue = &getAssociatedValue();
+    indicateOptimisticFixpoint();
+    return ChangeStatus::CHANGED;
+  }
+
 protected:
   // An assumed simplified value. Initially, it is set to Optional::None, which
   // means that the value is not clear under current assumption. If in the
@@ -3408,20 +4227,49 @@ protected:
 
 struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
   AAValueSimplifyArgument(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {}
 
+  void initialize(Attributor &A) override {
+    AAValueSimplifyImpl::initialize(A);
+    if (!getAssociatedFunction() || getAssociatedFunction()->isDeclaration())
+      indicatePessimisticFixpoint();
+    if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest},
+                /* IgnoreSubsumingPositions */ true))
+      indicatePessimisticFixpoint();
+  }
+
   /// See AbstractAttribute::updateImpl(...).
   ChangeStatus updateImpl(Attributor &A) override {
+    // Byval is only replaceable if it is readonly, otherwise we would write
+    // into the replaced value and not the copy that byval creates implicitly.
+    Argument *Arg = getAssociatedArgument();
+    if (Arg->hasByValAttr()) {
+      const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+      if (!MemAA.isAssumedReadOnly())
+        return indicatePessimisticFixpoint();
+    }
+
     bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
 
     auto PredForCallSite = [&](AbstractCallSite ACS) {
       // Check if we have an associated argument or not (which can happen for
       // callback calls).
-      if (Value *ArgOp = ACS.getCallArgOperand(getArgNo()))
-        return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue);
-      return false;
+      Value *ArgOp = ACS.getCallArgOperand(getArgNo());
+      if (!ArgOp)
+        return false;
+      // We can only propagate thread independent values through callbacks.
+      // This is different from direct/indirect call sites because for them we
+      // know the thread executing the caller and callee is the same. For
+      // callbacks this is not guaranteed, thus a thread dependent value could
+      // be different for the caller and callee, making it invalid to propagate.
+      if (ACS.isCallbackCall())
+        if (auto *C = dyn_cast<Constant>(ArgOp))
+          if (C->isThreadDependent())
            return false;
+      return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue);
    };

    if (!A.checkForAllCallSites(PredForCallSite, *this, true))
-      return indicatePessimisticFixpoint();
+      if (!askSimplifiedValueForAAValueConstantRange(A))
+        return indicatePessimisticFixpoint();
 
     // If a candidate was found in this update, return CHANGED.
     return HasValueBefore == SimplifiedAssociatedValue.hasValue()
@@ -3447,7 +4295,8 @@ struct AAValueSimplifyReturned : AAValueSimplifyImpl {
     };
 
     if (!A.checkForAllReturnedValues(PredForReturned, *this))
-      return indicatePessimisticFixpoint();
+      if (!askSimplifiedValueForAAValueConstantRange(A))
+        return indicatePessimisticFixpoint();
 
     // If a candidate was found in this update, return CHANGED.
     return HasValueBefore == SimplifiedAssociatedValue.hasValue()
@@ -3468,7 +4317,7 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
     Value &V = getAnchorValue();
 
     // TODO: add other stuff
-    if (isa<Constant>(V) || isa<UndefValue>(V))
+    if (isa<Constant>(V))
      indicatePessimisticFixpoint();
  }
 
@@ -3480,10 +4329,10 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
      auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
      if (!Stripped && this == &AA) {
        // TODO: Look at the instruction and check recursively.
+
        LLVM_DEBUG(
            dbgs() << "[Attributor][ValueSimplify] Can't be stripped more : " << V
                   << "\n");
-        indicatePessimisticFixpoint();
        return false;
      }
      return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
@@ -3492,7 +4341,8 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
    if (!genericValueTraversal<AAValueSimplify, BooleanState>(
            A, getIRPosition(), *this, static_cast<BooleanState &>(*this),
            VisitValueCB))
-      return indicatePessimisticFixpoint();
+      if (!askSimplifiedValueForAAValueConstantRange(A))
+        return indicatePessimisticFixpoint();
 
    // If a candidate was found in this update, return CHANGED.
 
@@ -3601,7 +4451,7 @@ struct AAHeapToStackImpl : public AAHeapToStack {
      AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
                           AI->getNextNode());
 
-      MallocCall->replaceAllUsesWith(AI);
+      replaceAllInstructionUsesWith(*MallocCall, *AI);
 
      if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
        auto *NBB = II->getNormalDest();
@@ -3645,76 +4495,80 @@ ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
   const Function *F = getAssociatedFunction();
   const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
 
-  auto UsesCheck = [&](Instruction &I) {
-    SmallPtrSet<const Use *, 8> Visited;
-    SmallVector<const Use *, 8> Worklist;
-
-    for (Use &U : I.uses())
-      Worklist.push_back(&U);
+  MustBeExecutedContextExplorer &Explorer =
+      A.getInfoCache().getMustBeExecutedContextExplorer();
 
-    while (!Worklist.empty()) {
-      const Use *U = Worklist.pop_back_val();
-      if (!Visited.insert(U).second)
-        continue;
-
-      auto *UserI = U->getUser();
+  auto FreeCheck = [&](Instruction &I) {
+    const auto &Frees = FreesForMalloc.lookup(&I);
+    if (Frees.size() != 1)
+      return false;
+    Instruction *UniqueFree = *Frees.begin();
+    return Explorer.findInContextOf(UniqueFree, I.getNextNode());
+  };
 
+  auto UsesCheck = [&](Instruction &I) {
+    bool ValidUsesOnly = true;
+    bool MustUse = true;
+    auto Pred = [&](const Use &U, bool &Follow) -> bool {
+      Instruction *UserI = cast<Instruction>(U.getUser());
       if (isa<LoadInst>(UserI))
-        continue;
+        return true;
      if (auto *SI = dyn_cast<StoreInst>(UserI)) {
-        if (SI->getValueOperand() == U->get()) {
-          LLVM_DEBUG(dbgs() << "[H2S] escaping store to memory: " << *UserI << "\n");
-          return false;
+        if (SI->getValueOperand() == U.get()) {
+          LLVM_DEBUG(dbgs()
+                     << "[H2S] escaping store to memory: " << *UserI << "\n");
+          ValidUsesOnly = false;
+        } else {
+          // A store into the malloc'ed memory is fine.
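+          // E.g., `P[0] = 42;` only initializes the allocation, whereas
+          // storing `P` itself somewhere (the branch above) would publish
+          // the pointer and block the heap-to-stack conversion.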
-        continue;
+        }
+        return true;
       }
-
-      // NOTE: Right now, if a function that has malloc pointer as an argument
-      // frees memory, we assume that the malloc pointer is freed.
-
-      // TODO: Add nofree callsite argument attribute to indicate that pointer
-      // argument is not freed.
       if (auto *CB = dyn_cast<CallBase>(UserI)) {
-        if (!CB->isArgOperand(U))
-          continue;
-
-        if (CB->isLifetimeStartOrEnd())
-          continue;
-
+        if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
+          return true;
         // Record the free for this malloc.
        if (isFreeCall(UserI, TLI)) {
-          FreesForMalloc[&I].insert(
-              cast<Instruction>(const_cast<User *>(UserI)));
-          continue;
+          if (MustUse) {
+            FreesForMalloc[&I].insert(UserI);
+          } else {
+            LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
+                              << *UserI << "\n");
+            ValidUsesOnly = false;
+          }
+          return true;
        }
 
-        // If a function does not free memory we are fine
-        const auto &NoFreeAA =
-            A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(*CB));
+        unsigned ArgNo = CB->getArgOperandNo(&U);
 
-        unsigned ArgNo = U - CB->arg_begin();
        const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
            *this, IRPosition::callsite_argument(*CB, ArgNo));
-        if (!NoCaptureAA.isAssumedNoCapture() || !NoFreeAA.isAssumedNoFree()) {
+        // If a callsite argument use is nofree, we are fine.
+        const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
+            *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+        if (!NoCaptureAA.isAssumedNoCapture() ||
+            !ArgNoFreeAA.isAssumedNoFree()) {
          LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
-          return false;
+          ValidUsesOnly = false;
        }
-        continue;
+        return true;
      }
 
-      if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI)) {
-        for (Use &U : UserI->uses())
-          Worklist.push_back(&U);
-        continue;
+      if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+          isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+        MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
+        Follow = true;
+        return true;
      }
-
-      // Unknown user.
+      // Unknown user for which we cannot track uses further (in a way that
+      // makes sense).
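+      // E.g., a pointer-to-integer cast:
+      // \code
+      //   uintptr_t X = (uintptr_t)P;  // ptrtoint is not followed further
+      // \endcode
+      // (illustrative; any user outside the cases above lands here).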
LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n"); - return false; - } - return true; + ValidUsesOnly = false; + return true; + }; + A.checkForAllUses(Pred, *this, I); + return ValidUsesOnly; }; auto MallocCallocCheck = [&](Instruction &I) { @@ -3730,8 +4584,8 @@ ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { if (IsMalloc) { if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0))) - if (Size->getValue().sle(MaxHeapToStackSize)) - if (UsesCheck(I)) { + if (Size->getValue().ule(MaxHeapToStackSize)) + if (UsesCheck(I) || FreeCheck(I)) { MallocCalls.insert(&I); return true; } @@ -3740,8 +4594,8 @@ ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0))) if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1))) if ((Size->getValue().umul_ov(Num->getValue(), Overflow)) - .sle(MaxHeapToStackSize)) - if (!Overflow && UsesCheck(I)) { + .ule(MaxHeapToStackSize)) + if (!Overflow && (UsesCheck(I) || FreeCheck(I))) { MallocCalls.insert(&I); return true; } @@ -3767,8 +4621,10 @@ struct AAHeapToStackFunction final : public AAHeapToStackImpl { /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECL(MallocCalls, Function, - "Number of MallocCalls converted to allocas"); - BUILD_STAT_NAME(MallocCalls, Function) += MallocCalls.size(); + "Number of malloc calls converted to allocas"); + for (auto *C : MallocCalls) + if (!BadMallocCalls.count(C)) + ++BUILD_STAT_NAME(MallocCalls, Function); } }; @@ -3787,9 +4643,10 @@ struct AAMemoryBehaviorImpl : public AAMemoryBehavior { /// Return the memory behavior information encoded in the IR for \p IRP. static void getKnownStateFromValue(const IRPosition &IRP, - IntegerState &State) { + BitIntegerState &State, + bool IgnoreSubsumingPositions = false) { SmallVector<Attribute, 2> Attrs; - IRP.getAttrs(AttrKinds, Attrs); + IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions); for (const Attribute &Attr : Attrs) { switch (Attr.getKindAsEnum()) { case Attribute::ReadNone: @@ -3829,7 +4686,7 @@ struct AAMemoryBehaviorImpl : public AAMemoryBehavior { /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { - IRPosition &IRP = getIRPosition(); + const IRPosition &IRP = getIRPosition(); // Check if we would improve the existing attributes first. SmallVector<Attribute, 4> DeducedAttrs; @@ -3911,12 +4768,25 @@ struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - AAMemoryBehaviorFloating::initialize(A); + intersectAssumedBits(BEST_STATE); + const IRPosition &IRP = getIRPosition(); + // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we + // can query it when we use has/getAttr. That would allow us to reuse the + // initialize of the base class here. + bool HasByVal = + IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true); + getKnownStateFromValue(IRP, getState(), + /* IgnoreSubsumingPositions */ HasByVal); // Initialize the use vector with all direct uses of the associated value. Argument *Arg = getAssociatedArgument(); - if (!Arg || !Arg->getParent()->hasExactDefinition()) + if (!Arg || !Arg->getParent()->hasExactDefinition()) { indicatePessimisticFixpoint(); + } else { + // Initialize the use vector with all direct uses of the associated value. 
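The malloc/calloc size checks in the hunk above switch from sle to ule and pair the calloc path with umul_ov. A small stand-alone illustration of why both matter, assuming an LLVM tree for APInt (the numbers are made up):

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  // calloc(Num, Size): the byte count is Num * Size, computed with
  // explicit unsigned-overflow detection.
  APInt Num(64, 1ULL << 33), Size(64, 1ULL << 33);
  bool Overflow = false;
  APInt Total = Size.umul_ov(Num, Overflow); // wraps; Overflow becomes true

  // ule() is the right comparison for sizes: a huge allocation with the
  // sign bit set is negative under sle() and would wrongly pass a signed
  // threshold check.
  APInt Threshold(64, 128);
  if (!Overflow && Total.ule(Threshold))
    outs() << "eligible for heap-to-stack\n";
  else
    outs() << "rejected: overflow or too large\n";
  return 0;
}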
+ for (const Use &U : Arg->uses()) + Uses.insert(&U); + } + } } ChangeStatus manifest(Attributor &A) override { @@ -3929,7 +4799,6 @@ struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating { return AAMemoryBehaviorFloating::manifest(A); } - /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { if (isAssumedReadNone()) @@ -3945,6 +4814,19 @@ struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP) : AAMemoryBehaviorArgument(IRP) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (Argument *Arg = getAssociatedArgument()) { + if (Arg->hasByValAttr()) { + addKnownBits(NO_WRITES); + removeKnownBits(NO_READS); + removeAssumedBits(NO_READS); + } + } else { + } + AAMemoryBehaviorArgument::initialize(A); + } + /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { // TODO: Once we have call site specific value information we can provide @@ -3956,7 +4838,7 @@ struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos); return clampStateAndIndicateChange( getState(), - static_cast<const AANoCapture::StateType &>(ArgAA.getState())); + static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState())); } /// See AbstractAttribute::trackStatistics() @@ -4036,7 +4918,8 @@ struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl { const IRPosition &FnPos = IRPosition::function(*F); auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); return clampStateAndIndicateChange( - getState(), static_cast<const AAAlign::StateType &>(FnAA.getState())); + getState(), + static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState())); } /// See AbstractAttribute::trackStatistics() @@ -4090,19 +4973,26 @@ ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) { // First, check the function scope. We take the known information and we avoid // work if the assumed information implies the current assumed information for - // this attribute. - const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); - S.addKnownBits(FnMemAA.getKnown()); - if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed()) - return ChangeStatus::UNCHANGED; + // this attribute. This is valid for all but byval arguments. + Argument *Arg = IRP.getAssociatedArgument(); + AAMemoryBehavior::base_t FnMemAssumedState = + AAMemoryBehavior::StateType::getWorstState(); + if (!Arg || !Arg->hasByValAttr()) { + const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + FnMemAssumedState = FnMemAA.getAssumed(); + S.addKnownBits(FnMemAA.getKnown()); + if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed()) + return ChangeStatus::UNCHANGED; + } // Make sure the value is not captured (except through "return"), if // it is, any information derived would be irrelevant anyway as we cannot // check the potential aliases introduced by the capture. However, no need // to fall back to anything less optimistic than the function state. 
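The new AAMemoryBehaviorCallSiteArgument::initialize above seeds byval arguments with NO_WRITES as known: the callee operates on a private copy, so its writes can never reach the caller's memory at that call site. The same semantics in ordinary C++, as a loose analogy (this program is illustrative only):

#include <iostream>

struct S { int X; };

// Pass-by-value plays the role of an IR byval argument: the callee
// receives, and may scribble over, its own copy of the bytes.
void callee(S ByVal) { ByVal.X = 42; }

int main() {
  S Val{1};
  callee(Val);
  std::cout << Val.X << "\n"; // still 1: the caller observes no write
  return 0;
}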
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP); + const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>( + *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL); if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { - S.intersectAssumedBits(FnMemAA.getAssumed()); + S.intersectAssumedBits(FnMemAssumedState); return ChangeStatus::CHANGED; } @@ -4223,7 +5113,451 @@ void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U, if (UserI->mayWriteToMemory()) removeAssumedBits(NO_WRITES); } +/// ------------------ Value Constant Range Attribute ------------------------- + +struct AAValueConstantRangeImpl : AAValueConstantRange { + using StateType = IntegerRangeState; + AAValueConstantRangeImpl(const IRPosition &IRP) : AAValueConstantRange(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + std::string Str; + llvm::raw_string_ostream OS(Str); + OS << "range(" << getBitWidth() << ")<"; + getKnown().print(OS); + OS << " / "; + getAssumed().print(OS); + OS << ">"; + return OS.str(); + } + + /// Helper function to get a SCEV expr for the associated value at program + /// point \p I. + const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return nullptr; + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>( + *getAnchorScope()); + + if (!SE || !LI) + return nullptr; + + const SCEV *S = SE->getSCEV(&getAssociatedValue()); + if (!I) + return S; + + return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent())); + } + + /// Helper function to get a range from SCEV for the associated value at + /// program point \p I. + ConstantRange getConstantRangeFromSCEV(Attributor &A, + const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + const SCEV *S = getSCEV(A, I); + if (!SE || !S) + return getWorstState(getBitWidth()); + + return SE->getUnsignedRange(S); + } + + /// Helper function to get a range from LVI for the associated value at + /// program point \p I. + ConstantRange + getConstantRangeFromLVI(Attributor &A, + const Instruction *CtxI = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + LazyValueInfo *LVI = + A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>( + *getAnchorScope()); + + if (!LVI || !CtxI) + return getWorstState(getBitWidth()); + return LVI->getConstantRange(&getAssociatedValue(), + const_cast<BasicBlock *>(CtxI->getParent()), + const_cast<Instruction *>(CtxI)); + } + + /// See AAValueConstantRange::getKnownConstantRange(..). + ConstantRange + getKnownConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + if (!CtxI || CtxI == getCtxI()) + return getKnown(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getKnown().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AAValueConstantRange::getAssumedConstantRange(..). + ConstantRange + getAssumedConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + // TODO: Make SCEV use Attributor assumption. + // We may be able to bound a variable range via assumptions in + // Attributor. ex.) 
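The getConstantRangeFromSCEV and getConstantRangeFromLVI helpers above feed initialize(), which intersects whatever each analysis knows about the value. The intersection idea in isolation, with hypothetical ranges (only ConstantRange from LLVM is used):

#include "llvm/IR/ConstantRange.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  // Pretend SCEV proved [0, 100) and LVI proved [10, 50) for one value.
  ConstantRange FromSCEV(APInt(32, 0), APInt(32, 100));
  ConstantRange FromLVI(APInt(32, 10), APInt(32, 50));

  // The attribute may keep the intersection: both analyses are sound,
  // so the value lies in every range they report.
  ConstantRange Known = FromSCEV.intersectWith(FromLVI);
  Known.print(outs()); // [10,50)
  outs() << "\n";
  return 0;
}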
If x is assumed to be in [1, 3] and y is known to + // evolve to x^2 + x, then we can say that y is in [2, 12]. + + if (!CtxI || CtxI == getCtxI()) + return getAssumed(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getAssumed().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AbstractAttribute::initialize(..). + void initialize(Attributor &A) override { + // Intersect a range given by SCEV. + intersectKnown(getConstantRangeFromSCEV(A, getCtxI())); + + // Intersect a range given by LVI. + intersectKnown(getConstantRangeFromLVI(A, getCtxI())); + } + + /// Helper function to create MDNode for range metadata. + static MDNode * + getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx, + const ConstantRange &AssumedConstantRange) { + Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getLower())), + ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getUpper()))}; + return MDNode::get(Ctx, LowAndHigh); + } + + /// Return true if \p Assumed is included in \p KnownRanges. + static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) { + + if (Assumed.isFullSet()) + return false; + + if (!KnownRanges) + return true; + + // If multiple ranges are annotated in IR, we give up annotating the assumed + // range for now. + + // TODO: If there exists a known range which contains the assumed range, we + // can say the assumed range is better. + if (KnownRanges->getNumOperands() > 2) + return false; + + ConstantInt *Lower = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(0)); + ConstantInt *Upper = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(1)); + + ConstantRange Known(Lower->getValue(), Upper->getValue()); + return Known.contains(Assumed) && Known != Assumed; + } + + /// Helper function to set range metadata. + static bool + setRangeMetadataIfisBetterRange(Instruction *I, + const ConstantRange &AssumedConstantRange) { + auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range); + if (isBetterRange(AssumedConstantRange, OldRangeMD)) { + if (!AssumedConstantRange.isEmptySet()) { + I->setMetadata(LLVMContext::MD_range, + getMDNodeForConstantRange(I->getType(), I->getContext(), + AssumedConstantRange)); + return true; + } + } + return false; + } + + /// See AbstractAttribute::manifest() + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + ConstantRange AssumedConstantRange = getAssumedConstantRange(A); + assert(!AssumedConstantRange.isFullSet() && "Invalid state"); + + auto &V = getAssociatedValue(); + if (!AssumedConstantRange.isEmptySet() && + !AssumedConstantRange.isSingleElement()) { + if (Instruction *I = dyn_cast<Instruction>(&V)) + if (isa<CallInst>(I) || isa<LoadInst>(I)) + if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange)) + Changed = ChangeStatus::CHANGED; + } + + return Changed; + } +}; + +struct AAValueConstantRangeArgument final : public AAValueConstantRangeImpl { + + AAValueConstantRangeArgument(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAArgumentFromCallSiteArguments + + IntegerRangeState S(getBitWidth()); + clampCallSiteArgumentStates<AAValueConstantRange, IntegerRangeState>( + A, *this, S); + + // TODO: If we know we visited all incoming values, thus none are assumed + // dead, we can take the known information from the state T. 
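getMDNodeForConstantRange above serializes a deduced range into the same two-operand form that !range metadata uses on loads and calls. A minimal stand-alone reconstruction of just that node, not tied to the Attributor:

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  // !range metadata is a pair: lower bound inclusive, upper exclusive,
  // so this node means "the value is in [0, 100)".
  Metadata *LowAndHigh[] = {
      ConstantAsMetadata::get(ConstantInt::get(I32, 0)),
      ConstantAsMetadata::get(ConstantInt::get(I32, 100))};
  MDNode *Range = MDNode::get(Ctx, LowAndHigh);
  Range->print(outs()); // prints roughly: !{i32 0, i32 100}
  outs() << "\n";
  return 0;
}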
+ return clampStateAndIndicateChange<IntegerRangeState>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_range) + } +}; + +struct AAValueConstantRangeReturned : AAValueConstantRangeImpl { + AAValueConstantRangeReturned(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAReturnedFromReturnedValues + + // TODO: If we know we visited all returned values, thus no are assumed + // dead, we can take the known information from the state T. + + IntegerRangeState S(getBitWidth()); + + clampReturnedValueStates<AAValueConstantRange, IntegerRangeState>(A, *this, + S); + return clampStateAndIndicateChange<StateType>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { + AAValueConstantRangeFloating(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAValueConstantRange::initialize(A); + Value &V = getAssociatedValue(); + + if (auto *C = dyn_cast<ConstantInt>(&V)) { + unionAssumed(ConstantRange(C->getValue())); + indicateOptimisticFixpoint(); + return; + } + + if (isa<UndefValue>(&V)) { + indicateOptimisticFixpoint(); + return; + } + + if (auto *I = dyn_cast<Instruction>(&V)) + if (isa<BinaryOperator>(I) || isa<CmpInst>(I)) { + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + + if (LHS->getType()->isIntegerTy() && RHS->getType()->isIntegerTy()) + return; + } + + // If it is a load instruction with range metadata, use it. + if (LoadInst *LI = dyn_cast<LoadInst>(&V)) + if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) { + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + return; + } + + // Otherwise we give up. + indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AAValueConstantRange] We give up: " + << getAssociatedValue()); + } + + bool calculateBinaryOperator(Attributor &A, BinaryOperator *BinOp, + IntegerRangeState &T, Instruction *CtxI) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + + auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange); + + T.unionAssumed(AssumedRange); + + // TODO: Track a known state too. + + return T.isValidState(); + } + + bool calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T, + Instruction *CtxI) { + Value *LHS = CmpI->getOperand(0); + Value *RHS = CmpI->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + + // If one of them is empty set, we can't decide. 
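calculateBinaryOperator above forwards the instruction's opcode straight to ConstantRange::binaryOp and unions the result into the assumed state. Its effect on a concrete pair of ranges (illustrative values):

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  ConstantRange LHS(APInt(32, 0), APInt(32, 4));   // {0,1,2,3}
  ConstantRange RHS(APInt(32, 10), APInt(32, 12)); // {10,11}
  // Same call the update uses: let ConstantRange do the interval math.
  ConstantRange Sum = LHS.binaryOp(Instruction::Add, RHS);
  Sum.print(outs()); // [10,15), i.e. 0+10 up to 3+11
  outs() << "\n";
  return 0;
}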
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet()) + return true; + + bool MustTrue = false, MustFalse = false; + + auto AllowedRegion = + ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange); + + auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion( + CmpI->getPredicate(), RHSAARange); + + if (AllowedRegion.intersectWith(LHSAARange).isEmptySet()) + MustFalse = true; + + if (SatisfyingRegion.contains(LHSAARange)) + MustTrue = true; + + assert((!MustTrue || !MustFalse) && + "Either MustTrue or MustFalse should be false!"); + + if (MustTrue) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1))); + else if (MustFalse) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0))); + else + T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true)); + + LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA + << " " << RHSAA << "\n"); + + // TODO: Track a known state too. + return T.isValidState(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + Instruction *CtxI = getCtxI(); + auto VisitValueCB = [&](Value &V, IntegerRangeState &T, + bool Stripped) -> bool { + Instruction *I = dyn_cast<Instruction>(&V); + if (!I) { + + // If the value is not an instruction, we query the Attributor for its AA. + const auto &AA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V)); + + // The clamp operator is not used so that the program point CtxI can be + // utilized. + T.unionAssumed(AA.getAssumedConstantRange(A, CtxI)); + + return T.isValidState(); + } + + if (auto *BinOp = dyn_cast<BinaryOperator>(I)) + return calculateBinaryOperator(A, BinOp, T, CtxI); + else if (auto *CmpI = dyn_cast<CmpInst>(I)) + return calculateCmpInst(A, CmpI, T, CtxI); + else { + // Give up on other instructions. + // TODO: Add other instructions + T.indicatePessimisticFixpoint(); + return false; + } + }; + + IntegerRangeState T(getBitWidth()); + + if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFunction : AAValueConstantRangeImpl { + AAValueConstantRangeFunction(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will " + "not be called"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction { + AAValueConstantRangeCallSite(const IRPosition &IRP) + : AAValueConstantRangeFunction(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSiteReturned : AAValueConstantRangeReturned { + AAValueConstantRangeCallSiteReturned(const IRPosition &IRP) + : AAValueConstantRangeReturned(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // If it is a call instruction with range metadata, use the metadata. 
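The MustTrue/MustFalse logic in calculateCmpInst above rests on two ConstantRange helpers. Demonstrated on their own, with hypothetical ranges:

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  // Is "icmp ult LHS, RHS" decidable when LHS is in [0,10) and RHS in [20,30)?
  ConstantRange LHS(APInt(32, 0), APInt(32, 10));
  ConstantRange RHS(APInt(32, 20), APInt(32, 30));
  auto Pred = CmpInst::ICMP_ULT;

  // LHS values for which the predicate *can* hold for some RHS value...
  ConstantRange Allowed = ConstantRange::makeAllowedICmpRegion(Pred, RHS);
  // ...and those for which it holds for *every* RHS value.
  ConstantRange Satisfying = ConstantRange::makeSatisfyingICmpRegion(Pred, RHS);

  if (Allowed.intersectWith(LHS).isEmptySet())
    outs() << "always false\n";
  else if (Satisfying.contains(LHS))
    outs() << "always true\n"; // this case: every value below 10 is below 20
  else
    outs() << "unknown\n";
  return 0;
}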
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue())) + if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range)) + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + + AAValueConstantRangeReturned::initialize(A); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_range) + } +}; +struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating { + AAValueConstantRangeCallSiteArgument(const IRPosition &IRP) + : AAValueConstantRangeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_range) + } +}; /// ---------------------------------------------------------------------------- /// Attributor /// ---------------------------------------------------------------------------- @@ -4234,6 +5568,8 @@ bool Attributor::isAssumedDead(const AbstractAttribute &AA, if (!CtxI) return false; + // TODO: Find a good way to utilize fine and coarse grained liveness + // information. if (!LivenessAA) LivenessAA = &getAAFor<AAIsDead>(AA, IRPosition::function(*CtxI->getFunction()), @@ -4247,7 +5583,58 @@ bool Attributor::isAssumedDead(const AbstractAttribute &AA, return false; // We actually used liveness information so we have to record a dependence. - recordDependence(*LivenessAA, AA); + recordDependence(*LivenessAA, AA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllUses( + const function_ref<bool(const Use &, bool &)> &Pred, + const AbstractAttribute &QueryingAA, const Value &V) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + SmallVector<const Use *, 16> Worklist; + SmallPtrSet<const Use *, 16> Visited; + + for (const Use &U : V.uses()) + Worklist.push_back(&U); + + LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size() + << " initial uses to check\n"); + + if (Worklist.empty()) + return true; + + bool AnyDead = false; + const Function *ScopeFn = IRP.getAnchorScope(); + const auto *LivenessAA = + ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn), + /* TrackDependence */ false) + : nullptr; + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + if (!Visited.insert(U).second) + continue; + LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << "\n"); + if (Instruction *UserI = dyn_cast<Instruction>(U->getUser())) + if (LivenessAA && LivenessAA->isAssumedDead(UserI)) { + LLVM_DEBUG(dbgs() << "[Attributor] Dead user: " << *UserI << ": " + << *LivenessAA << "\n"); + AnyDead = true; + continue; + } + + bool Follow = false; + if (!Pred(*U, Follow)) + return false; + if (!Follow) + continue; + for (const Use &UU : U->getUser()->uses()) + Worklist.push_back(&UU); + } + + if (AnyDead) + recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); return true; } @@ -4284,10 +5671,12 @@ bool Attributor::checkForAllCallSites( for (const Use &U : Fn.uses()) { AbstractCallSite ACS(&U); if (!ACS) { - LLVM_DEBUG(dbgs() << "[Attributor] Function " - << Fn.getName() + LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName() << " has non call site use " << *U.get() << " in " << *U.getUser() << "\n"); + // BlockAddress users are allowed. + if (isa<BlockAddress>(U.getUser())) + continue; return false; } @@ -4296,14 +5685,14 @@ bool Attributor::checkForAllCallSites( const auto *LivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*Caller), QueryingAA, - /* TrackDependence */ false); + /* TrackDependence */ false); // Skip dead calls. 
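The new Attributor::checkForAllUses above generalizes the hand-rolled worklists that individual attributes used to carry. An LLVM-free sketch of the traversal contract, in particular the Follow out-parameter (all names here are invented for illustration):

#include <functional>
#include <set>
#include <vector>

// Visit direct uses first; descend through a user only when the predicate
// opts in by setting Follow (e.g. for casts, GEPs, PHIs, selects).
template <typename UseT>
bool checkForAllUsesSketch(
    const std::vector<UseT *> &InitialUses,
    const std::function<bool(const UseT &, bool &)> &Pred,
    const std::function<std::vector<UseT *>(const UseT &)> &UsesOfUser) {
  std::vector<UseT *> Worklist(InitialUses);
  std::set<UseT *> Visited;
  while (!Worklist.empty()) {
    UseT *U = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(U).second)
      continue; // cycles through PHIs are possible
    bool Follow = false;
    if (!Pred(*U, Follow))
      return false; // a single bad use fails the whole query
    if (Follow)
      for (UseT *UU : UsesOfUser(*U))
        Worklist.push_back(UU);
  }
  return true;
}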
if (LivenessAA && LivenessAA->isAssumedDead(I)) { // We actually used liveness information so we have to record a // dependence. if (QueryingAA) - recordDependence(*LivenessAA, *QueryingAA); + recordDependence(*LivenessAA, *QueryingAA, DepClassTy::OPTIONAL); continue; } @@ -4313,8 +5702,7 @@ bool Attributor::checkForAllCallSites( if (!RequireAllCallSites) continue; LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() - << " is an invalid use of " - << Fn.getName() << "\n"); + << " is an invalid use of " << Fn.getName() << "\n"); return false; } @@ -4417,7 +5805,7 @@ bool Attributor::checkForAllInstructions( // If we actually used liveness information so we have to record a dependence. if (AnyDead) - recordDependence(LivenessAA, QueryingAA); + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); return true; } @@ -4451,7 +5839,7 @@ bool Attributor::checkForAllReadWriteInstructions( // If we actually used liveness information so we have to record a dependence. if (AnyDead) - recordDependence(LivenessAA, QueryingAA); + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); return true; } @@ -4467,7 +5855,7 @@ ChangeStatus Attributor::run(Module &M) { unsigned IterationCounter = 1; SmallVector<AbstractAttribute *, 64> ChangedAAs; - SetVector<AbstractAttribute *> Worklist; + SetVector<AbstractAttribute *> Worklist, InvalidAAs; Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); bool RecomputeDependences = false; @@ -4478,6 +5866,29 @@ ChangeStatus Attributor::run(Module &M) { LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); + // For invalid AAs we can fix dependent AAs that have a required dependence, + // thereby folding long dependence chains in a single step without the need + // to run updates. + for (unsigned u = 0; u < InvalidAAs.size(); ++u) { + AbstractAttribute *InvalidAA = InvalidAAs[u]; + auto &QuerriedAAs = QueryMap[InvalidAA]; + LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has " + << QuerriedAAs.RequiredAAs.size() << "/" + << QuerriedAAs.OptionalAAs.size() + << " required/optional dependences\n"); + for (AbstractAttribute *DepOnInvalidAA : QuerriedAAs.RequiredAAs) { + AbstractState &DOIAAState = DepOnInvalidAA->getState(); + DOIAAState.indicatePessimisticFixpoint(); + ++NumAttributesFixedDueToRequiredDependences; + assert(DOIAAState.isAtFixpoint() && "Expected fixpoint state!"); + if (!DOIAAState.isValidState()) + InvalidAAs.insert(DepOnInvalidAA); + } + if (!RecomputeDependences) + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + } + // If dependences (=QueryMap) are recomputed we have to look at all abstract // attributes again, regardless of what changed in the last iteration. if (RecomputeDependences) { @@ -4493,22 +5904,35 @@ ChangeStatus Attributor::run(Module &M) { // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) { auto &QuerriedAAs = QueryMap[ChangedAA]; - Worklist.insert(QuerriedAAs.begin(), QuerriedAAs.end()); + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + Worklist.insert(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter << ", Worklist+Dependent size: " << Worklist.size() << "\n"); - // Reset the changed set. + // Reset the changed and invalid set. 
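The InvalidAAs loop above is the interesting part of the reworked solver: a required dependence on an invalid attribute is fatal and can be propagated without further update calls, while optional dependents are merely re-queued. A compiler-agnostic sketch of that folding step (all types here are stand-ins, not the Attributor's own):

#include <map>
#include <set>
#include <vector>

struct AASketch {
  bool Valid = true;
  void indicatePessimisticFixpoint() { Valid = false; }
};
struct DepsSketch {
  std::set<AASketch *> Required, Optional;
};

// When an attribute goes invalid, everything that *required* it is forced
// to its pessimistic fixpoint immediately; chains fold in one pass because
// the vector grows while we walk it. Optional dependents get re-queued.
void foldInvalid(std::vector<AASketch *> InvalidAAs,
                 std::map<AASketch *, DepsSketch> &QueryMap,
                 std::set<AASketch *> &Worklist) {
  std::set<AASketch *> Seen(InvalidAAs.begin(), InvalidAAs.end());
  for (size_t U = 0; U < InvalidAAs.size(); ++U) {
    DepsSketch &D = QueryMap[InvalidAAs[U]];
    for (AASketch *Dep : D.Required) {
      Dep->indicatePessimisticFixpoint();
      if (Seen.insert(Dep).second)
        InvalidAAs.push_back(Dep);
    }
    Worklist.insert(D.Optional.begin(), D.Optional.end());
  }
}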
ChangedAAs.clear(); + InvalidAAs.clear(); // Update all abstract attribute in the work list and record the ones that // changed. for (AbstractAttribute *AA : Worklist) - if (!isAssumedDead(*AA, nullptr)) - if (AA->update(*this) == ChangeStatus::CHANGED) + if (!AA->getState().isAtFixpoint() && !isAssumedDead(*AA, nullptr)) { + QueriedNonFixAA = false; + if (AA->update(*this) == ChangeStatus::CHANGED) { ChangedAAs.push_back(AA); + if (!AA->getState().isValidState()) + InvalidAAs.insert(AA); + } else if (!QueriedNonFixAA) { + // If the attribute did not query any non-fix information, the state + // will not change and we can indicate that right away. + AA->getState().indicateOptimisticFixpoint(); + } + } // Check if we recompute the dependences in the next iteration. RecomputeDependences = (DepRecomputeInterval > 0 && @@ -4552,7 +5976,10 @@ ChangeStatus Attributor::run(Module &M) { } auto &QuerriedAAs = QueryMap[ChangedAA]; - ChangedAAs.append(QuerriedAAs.begin(), QuerriedAAs.end()); + ChangedAAs.append(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + ChangedAAs.append(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } LLVM_DEBUG({ @@ -4611,27 +6038,85 @@ ChangeStatus Attributor::run(Module &M) { LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least " << ToBeDeletedFunctions.size() << " functions and " << ToBeDeletedBlocks.size() << " blocks and " - << ToBeDeletedInsts.size() << " instructions\n"); + << ToBeDeletedInsts.size() << " instructions and " + << ToBeChangedUses.size() << " uses\n"); + + SmallVector<Instruction *, 32> DeadInsts; + SmallVector<Instruction *, 32> TerminatorsToFold; + + for (auto &It : ToBeChangedUses) { + Use *U = It.first; + Value *NewV = It.second; + Value *OldV = U->get(); + LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser() + << " instead of " << *OldV << "\n"); + U->set(NewV); + if (Instruction *I = dyn_cast<Instruction>(OldV)) + if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) && + isInstructionTriviallyDead(I)) { + DeadInsts.push_back(I); + } + if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) { + Instruction *UserI = cast<Instruction>(U->getUser()); + if (isa<UndefValue>(NewV)) { + ToBeChangedToUnreachableInsts.insert(UserI); + } else { + TerminatorsToFold.push_back(UserI); + } + } + } + for (auto &V : InvokeWithDeadSuccessor) + if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) { + bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind); + bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn); + bool Invoke2CallAllowed = + !AAIsDeadFunction::mayCatchAsynchronousExceptions( + *II->getFunction()); + assert((UnwindBBIsDead || NormalBBIsDead) && + "Invoke does not have dead successors!"); + BasicBlock *BB = II->getParent(); + BasicBlock *NormalDestBB = II->getNormalDest(); + if (UnwindBBIsDead) { + Instruction *NormalNextIP = &NormalDestBB->front(); + if (Invoke2CallAllowed) { + changeToCall(II); + NormalNextIP = BB->getTerminator(); + } + if (NormalBBIsDead) + ToBeChangedToUnreachableInsts.insert(NormalNextIP); + } else { + assert(NormalBBIsDead && "Broken invariant!"); + if (!NormalDestBB->getUniquePredecessor()) + NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front()); + } + } + for (auto &V : ToBeChangedToUnreachableInsts) + if (Instruction *I = dyn_cast_or_null<Instruction>(V)) + changeToUnreachable(I, /* UseLLVMTrap */ false); + for (Instruction *I : TerminatorsToFold) + ConstantFoldTerminator(I->getParent()); + for 
(Instruction *I : ToBeDeletedInsts) { - if (!I->use_empty()) - I->replaceAllUsesWith(UndefValue::get(I->getType())); - I->eraseFromParent(); + I->replaceAllUsesWith(UndefValue::get(I->getType())); + if (!isa<PHINode>(I) && isInstructionTriviallyDead(I)) + DeadInsts.push_back(I); + else + I->eraseFromParent(); } + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); + if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) { SmallVector<BasicBlock *, 8> ToBeDeletedBBs; ToBeDeletedBBs.reserve(NumDeadBlocks); ToBeDeletedBBs.append(ToBeDeletedBlocks.begin(), ToBeDeletedBlocks.end()); - DeleteDeadBlocks(ToBeDeletedBBs); - STATS_DECLTRACK(AAIsDead, BasicBlock, - "Number of dead basic blocks deleted."); - } - - STATS_DECL(AAIsDead, Function, "Number of dead functions deleted."); - for (Function *Fn : ToBeDeletedFunctions) { - Fn->replaceAllUsesWith(UndefValue::get(Fn->getType())); - Fn->eraseFromParent(); - STATS_TRACK(AAIsDead, Function); + // Actually we do not delete the blocks but squash them into a single + // unreachable but untangling branches that jump here is something we need + // to do in a more generic way. + DetatchDeadBlocks(ToBeDeletedBBs, nullptr); + STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted."); + BUILD_STAT_NAME(AAIsDead, BasicBlock) += ToBeDeletedBlocks.size(); } // Identify dead internal functions and delete them. This happens outside @@ -4651,22 +6136,33 @@ ChangeStatus Attributor::run(Module &M) { if (!F) continue; - const auto *LivenessAA = - lookupAAFor<AAIsDead>(IRPosition::function(*F)); - if (LivenessAA && - !checkForAllCallSites([](AbstractCallSite ACS) { return false; }, - *LivenessAA, true)) + if (!checkForAllCallSites( + [this](AbstractCallSite ACS) { + return ToBeDeletedFunctions.count( + ACS.getInstruction()->getFunction()); + }, + *F, true, nullptr)) continue; - STATS_TRACK(AAIsDead, Function); - F->replaceAllUsesWith(UndefValue::get(F->getType())); - F->eraseFromParent(); + ToBeDeletedFunctions.insert(F); InternalFns[u] = nullptr; FoundDeadFn = true; } } } + STATS_DECL(AAIsDead, Function, "Number of dead functions deleted."); + BUILD_STAT_NAME(AAIsDead, Function) += ToBeDeletedFunctions.size(); + + // Rewrite the functions as requested during manifest. + ManifestChange = ManifestChange | rewriteFunctionSignatures(); + + for (Function *Fn : ToBeDeletedFunctions) { + Fn->deleteBody(); + Fn->replaceAllUsesWith(UndefValue::get(Fn->getType())); + Fn->eraseFromParent(); + } + if (VerifyMaxFixpointIterations && IterationCounter != MaxFixpointIterations) { errs() << "\n[Attributor] Fixpoint iteration done after: " @@ -4679,6 +6175,252 @@ ChangeStatus Attributor::run(Module &M) { return ManifestChange; } +bool Attributor::registerFunctionSignatureRewrite( + Argument &Arg, ArrayRef<Type *> ReplacementTypes, + ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB, + ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) { + + auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { + // Forbid must-tail calls for now. + return !ACS.isCallbackCall() && !ACS.getCallSite().isMustTailCall(); + }; + + Function *Fn = Arg.getParent(); + // Avoid var-arg functions for now. + if (Fn->isVarArg()) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n"); + return false; + } + + // Avoid functions with complicated argument passing semantics. 
+ AttributeList FnAttributeList = Fn->getAttributes(); + if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) || + FnAttributeList.hasAttrSomewhere(Attribute::StructRet) || + FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) { + LLVM_DEBUG( + dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n"); + return false; + } + + // Avoid callbacks for now. + if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr)) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n"); + return false; + } + + auto InstPred = [](Instruction &I) { + if (auto *CI = dyn_cast<CallInst>(&I)) + return !CI->isMustTailCall(); + return true; + }; + + // Forbid must-tail calls for now. + // TODO: + bool AnyDead; + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn); + if (!checkForAllInstructionsImpl(OpcodeInstMap, InstPred, nullptr, AnyDead, + {Instruction::Call})) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n"); + return false; + } + + SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = ArgumentReplacementMap[Fn]; + if (ARIs.size() == 0) + ARIs.resize(Fn->arg_size()); + + // If we have a replacement already with less than or equal new arguments, + // ignore this request. + ArgumentReplacementInfo *&ARI = ARIs[Arg.getArgNo()]; + if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) { + LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n"); + return false; + } + + // If we have a replacement already but we like the new one better, delete + // the old. + if (ARI) + delete ARI; + + // Remember the replacement. + ARI = new ArgumentReplacementInfo(*this, Arg, ReplacementTypes, + std::move(CalleeRepairCB), + std::move(ACSRepairCB)); + + return true; +} + +ChangeStatus Attributor::rewriteFunctionSignatures() { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + for (auto &It : ArgumentReplacementMap) { + Function *OldFn = It.getFirst(); + + // Deleted functions do not require rewrites. + if (ToBeDeletedFunctions.count(OldFn)) + continue; + + const SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = It.getSecond(); + assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!"); + + SmallVector<Type *, 16> NewArgumentTypes; + SmallVector<AttributeSet, 16> NewArgumentAttributes; + + // Collect replacement argument types and copy over existing attributes. + AttributeList OldFnAttributeList = OldFn->getAttributes(); + for (Argument &Arg : OldFn->args()) { + if (ArgumentReplacementInfo *ARI = ARIs[Arg.getArgNo()]) { + NewArgumentTypes.append(ARI->ReplacementTypes.begin(), + ARI->ReplacementTypes.end()); + NewArgumentAttributes.append(ARI->getNumReplacementArgs(), + AttributeSet()); + } else { + NewArgumentTypes.push_back(Arg.getType()); + NewArgumentAttributes.push_back( + OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + } + } + + FunctionType *OldFnTy = OldFn->getFunctionType(); + Type *RetTy = OldFnTy->getReturnType(); + + // Construct the new function type using the new arguments types. + FunctionType *NewFnTy = + FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg()); + + LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName() + << "' from " << *OldFn->getFunctionType() << " to " + << *NewFnTy << "\n"); + + // Create the new function body and insert it into the module. 
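One subtlety in registerFunctionSignatureRewrite above: when two rewrites target the same argument, the request that already expands into no more replacement types than the newcomer is kept, and the new request is ignored. A small stand-alone restatement of that arbitration, with the type list reduced to a plain vector for illustration:

#include <cstddef>
#include <memory>
#include <vector>

struct RewriteRequest {
  std::vector<int> ReplacementTypes; // stand-in for ArrayRef<Type *>
  size_t getNumReplacementArgs() const { return ReplacementTypes.size(); }
};

// Mirrors the check in the hunk above: an existing request with fewer or
// equally many replacement arguments wins over the new one.
bool registerRewrite(std::unique_ptr<RewriteRequest> &Slot,
                     std::vector<int> NewTypes) {
  if (Slot && Slot->getNumReplacementArgs() <= NewTypes.size())
    return false; // existing rewrite is preferred
  Slot.reset(new RewriteRequest{std::move(NewTypes)});
  return true;
}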
+ Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(), + OldFn->getAddressSpace(), ""); + OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn); + NewFn->takeName(OldFn); + NewFn->copyAttributesFrom(OldFn); + + // Patch the pointer to LLVM function in debug info descriptor. + NewFn->setSubprogram(OldFn->getSubprogram()); + OldFn->setSubprogram(nullptr); + + // Recompute the parameter attributes list based on the new arguments for + // the function. + LLVMContext &Ctx = OldFn->getContext(); + NewFn->setAttributes(AttributeList::get( + Ctx, OldFnAttributeList.getFnAttributes(), + OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NewFn->getBasicBlockList().splice(NewFn->begin(), + OldFn->getBasicBlockList()); + + // Set of all "call-like" instructions that invoke the old function mapped + // to their new replacements. + SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs; + + // Callback to create a new "call-like" instruction for a given one. + auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) { + CallBase *OldCB = cast<CallBase>(ACS.getInstruction()); + const AttributeList &OldCallAttributeList = OldCB->getAttributes(); + + // Collect the new argument operands for the replacement call site. + SmallVector<Value *, 16> NewArgOperands; + SmallVector<AttributeSet, 16> NewArgOperandAttributes; + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) { + unsigned NewFirstArgNum = NewArgOperands.size(); + (void)NewFirstArgNum; // only used inside assert. + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->ACSRepairCB) + ARI->ACSRepairCB(*ARI, ACS, NewArgOperands); + assert(ARI->getNumReplacementArgs() + NewFirstArgNum == + NewArgOperands.size() && + "ACS repair callback did not provide as many operand as new " + "types were registered!"); + // TODO: Exose the attribute set to the ACS repair callback + NewArgOperandAttributes.append(ARI->ReplacementTypes.size(), + AttributeSet()); + } else { + NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); + NewArgOperandAttributes.push_back( + OldCallAttributeList.getParamAttributes(OldArgNum)); + } + } + + assert(NewArgOperands.size() == NewArgOperandAttributes.size() && + "Mismatch # argument operands vs. # argument operand attributes!"); + assert(NewArgOperands.size() == NewFn->arg_size() && + "Mismatch # argument operands vs. # function arguments!"); + + SmallVector<OperandBundleDef, 4> OperandBundleDefs; + OldCB->getOperandBundlesAsDefs(OperandBundleDefs); + + // Create a new call or invoke instruction to replace the old one. + CallBase *NewCB; + if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) { + NewCB = + InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(), + NewArgOperands, OperandBundleDefs, "", OldCB); + } else { + auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs, + "", OldCB); + NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind()); + NewCB = NewCI; + } + + // Copy over various properties and the new attributes. 
+ uint64_t W; + if (OldCB->extractProfTotalWeight(W)) + NewCB->setProfWeight(W); + NewCB->setCallingConv(OldCB->getCallingConv()); + NewCB->setDebugLoc(OldCB->getDebugLoc()); + NewCB->takeName(OldCB); + NewCB->setAttributes(AttributeList::get( + Ctx, OldCallAttributeList.getFnAttributes(), + OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + + CallSitePairs.push_back({OldCB, NewCB}); + return true; + }; + + // Use the CallSiteReplacementCreator to create replacement call sites. + bool Success = + checkForAllCallSites(CallSiteReplacementCreator, *OldFn, true, nullptr); + (void)Success; + assert(Success && "Assumed call site replacement to succeed!"); + + // Rewire the arguments. + auto OldFnArgIt = OldFn->arg_begin(); + auto NewFnArgIt = NewFn->arg_begin(); + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); + ++OldArgNum, ++OldFnArgIt) { + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->CalleeRepairCB) + ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt); + NewFnArgIt += ARI->ReplacementTypes.size(); + } else { + NewFnArgIt->takeName(&*OldFnArgIt); + OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt); + ++NewFnArgIt; + } + } + + // Eliminate the instructions *after* we visited all of them. + for (auto &CallSitePair : CallSitePairs) { + CallBase &OldCB = *CallSitePair.first; + CallBase &NewCB = *CallSitePair.second; + OldCB.replaceAllUsesWith(&NewCB); + OldCB.eraseFromParent(); + } + + ToBeDeletedFunctions.insert(OldFn); + + Changed = ChangeStatus::CHANGED; + } + + return Changed; +} + void Attributor::initializeInformationCache(Function &F) { // Walk all instructions to find interesting instructions that might be @@ -4710,6 +6452,9 @@ void Attributor::initializeInformationCache(Function &F) { case Instruction::Invoke: case Instruction::CleanupRet: case Instruction::CatchSwitch: + case Instruction::AtomicRMW: + case Instruction::AtomicCmpXchg: + case Instruction::Br: case Instruction::Resume: case Instruction::Ret: IsInterestingOpcode = true; @@ -4721,9 +6466,26 @@ void Attributor::initializeInformationCache(Function &F) { } } +void Attributor::recordDependence(const AbstractAttribute &FromAA, + const AbstractAttribute &ToAA, + DepClassTy DepClass) { + if (FromAA.getState().isAtFixpoint()) + return; + + if (DepClass == DepClassTy::REQUIRED) + QueryMap[&FromAA].RequiredAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + else + QueryMap[&FromAA].OptionalAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + QueriedNonFixAA = true; +} + void Attributor::identifyDefaultAbstractAttributes(Function &F) { if (!VisitedFunctions.insert(&F).second) return; + if (F.isDeclaration()) + return; IRPosition FPos = IRPosition::function(F); @@ -4735,6 +6497,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every function might be "will-return". getOrCreateAAFor<AAWillReturn>(FPos); + // Every function might contain instructions that cause "undefined behavior". + getOrCreateAAFor<AAUndefinedBehavior>(FPos); + // Every function can be nounwind. getOrCreateAAFor<AANoUnwind>(FPos); @@ -4766,6 +6531,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { IRPosition RetPos = IRPosition::returned(F); + // Every returned value might be dead. + getOrCreateAAFor<AAIsDead>(RetPos); + // Every function might be simplified. getOrCreateAAFor<AAValueSimplify>(RetPos); @@ -4811,16 +6579,41 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every argument with pointer type might be marked // "readnone/readonly/writeonly/..." 
getOrCreateAAFor<AAMemoryBehavior>(ArgPos); + + // Every argument with pointer type might be marked nofree. + getOrCreateAAFor<AANoFree>(ArgPos); } } auto CallSitePred = [&](Instruction &I) -> bool { CallSite CS(&I); - if (CS.getCalledFunction()) { - for (int i = 0, e = CS.getCalledFunction()->arg_size(); i < e; i++) { + if (Function *Callee = CS.getCalledFunction()) { + // Skip declerations except if annotations on their call sites were + // explicitly requested. + if (!AnnotateDeclarationCallSites && Callee->isDeclaration() && + !Callee->hasMetadata(LLVMContext::MD_callback)) + return true; + + if (!Callee->getReturnType()->isVoidTy() && !CS->use_empty()) { + + IRPosition CSRetPos = IRPosition::callsite_returned(CS); + + // Call site return values might be dead. + getOrCreateAAFor<AAIsDead>(CSRetPos); + + // Call site return integer values might be limited by a constant range. + if (Callee->getReturnType()->isIntegerTy()) { + getOrCreateAAFor<AAValueConstantRange>(CSRetPos); + } + } + + for (int i = 0, e = CS.getNumArgOperands(); i < e; i++) { IRPosition CSArgPos = IRPosition::callsite_argument(CS, i); + // Every call site argument might be dead. + getOrCreateAAFor<AAIsDead>(CSArgPos); + // Call site argument might be simplified. getOrCreateAAFor<AAValueSimplify>(CSArgPos); @@ -4838,6 +6631,13 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Call site argument attribute "align". getOrCreateAAFor<AAAlign>(CSArgPos); + + // Call site argument attribute + // "readnone/readonly/writeonly/..." + getOrCreateAAFor<AAMemoryBehavior>(CSArgPos); + + // Call site argument attribute "nofree". + getOrCreateAAFor<AANoFree>(CSArgPos); } } return true; @@ -4903,11 +6703,24 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) { << Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}"; } -raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerState &S) { +template <typename base_ty, base_ty BestState, base_ty WorstState> +raw_ostream & +llvm::operator<<(raw_ostream &OS, + const IntegerStateBase<base_ty, BestState, WorstState> &S) { return OS << "(" << S.getKnown() << "-" << S.getAssumed() << ")" << static_cast<const AbstractState &>(S); } +raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) { + OS << "range-state(" << S.getBitWidth() << ")<"; + S.getKnown().print(OS); + OS << " / "; + S.getAssumed().print(OS); + OS << ">"; + + return OS << static_cast<const AbstractState &>(S); +} + raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) { return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? 
"fix" : "")); } @@ -4963,7 +6776,9 @@ static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { A.identifyDefaultAbstractAttributes(F); } - return A.run(M) == ChangeStatus::CHANGED; + bool Changed = A.run(M) == ChangeStatus::CHANGED; + assert(!verifyModule(M, &errs()) && "Module verification failed!"); + return Changed; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { @@ -5011,7 +6826,9 @@ const char AANoFree::ID = 0; const char AANonNull::ID = 0; const char AANoRecurse::ID = 0; const char AAWillReturn::ID = 0; +const char AAUndefinedBehavior::ID = 0; const char AANoAlias::ID = 0; +const char AAReachability::ID = 0; const char AANoReturn::ID = 0; const char AAIsDead::ID = 0; const char AADereferenceable::ID = 0; @@ -5020,6 +6837,7 @@ const char AANoCapture::ID = 0; const char AAValueSimplify::ID = 0; const char AAHeapToStack::ID = 0; const char AAMemoryBehavior::ID = 0; +const char AAValueConstantRange::ID = 0; // Macro magic to create the static generator function for attributes that // follow the naming scheme. @@ -5115,11 +6933,9 @@ const char AAMemoryBehavior::ID = 0; CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync) -CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) -CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) @@ -5127,10 +6943,15 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange) CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp index 6b68aa90c567..b49a92ad16b3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -17,6 +17,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp index de80c88c1591..aec470ffadc4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff 
--git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp index 20cb3213628e..f28a399b1779 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp @@ -21,6 +21,8 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp index 3cf839e397f8..ea1278aa108f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index e20159ba0db5..2fe9a59ad210 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 968a13110b16..61d519d8ae88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index fc52db562c62..7f138d206fac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/GlobalStatus.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index b38cb6d0ed3f..2cb184e8d4f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -11,6 +11,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git 
a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index b174c63a577b..b6d0b2e35694 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp index 3f5cc078d75f..be0446a946ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" #include "llvm/Linker/IRMover.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" @@ -231,7 +232,8 @@ selectCallee(const ModuleSummaryIndex &Index, return false; } - if (Summary->instCount() > Threshold) { + if ((Summary->instCount() > Threshold) && + !Summary->fflags().AlwaysInline) { Reason = FunctionImporter::ImportFailureReason::TooLarge; return false; } @@ -280,7 +282,8 @@ updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { } static void computeImportForReferencedGlobals( - const FunctionSummary &Summary, const GVSummaryMapTy &DefinedGVSummaries, + const FunctionSummary &Summary, const ModuleSummaryIndex &Index, + const GVSummaryMapTy &DefinedGVSummaries, FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists) { for (auto &VI : Summary.refs()) { @@ -303,16 +306,28 @@ static void computeImportForReferencedGlobals( RefSummary->modulePath() != Summary.modulePath(); }; + auto MarkExported = [&](const ValueInfo &VI, const GlobalValueSummary *S) { + if (ExportLists) + (*ExportLists)[S->modulePath()].insert(VI); + }; + for (auto &RefSummary : VI.getSummaryList()) if (isa<GlobalVarSummary>(RefSummary.get()) && - canImportGlobalVar(RefSummary.get()) && + Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) && !LocalNotInModule(RefSummary.get())) { auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID()); // Only update stat if we haven't already imported this variable. if (ILI.second) NumImportedGlobalVarsThinLink++; - if (ExportLists) - (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID()); + MarkExported(VI, RefSummary.get()); + // Promote referenced functions and variables. We don't promote + // objects referenced by writeonly variable initializer, because + // we convert such variables initializers to "zeroinitializer". + // See processGlobalForThinLTO. 
+ if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get()))) + for (const auto &VI : RefSummary->refs()) + for (const auto &RefFn : VI.getSummaryList()) + MarkExported(VI, RefFn.get()); break; } } @@ -351,8 +366,8 @@ static void computeImportForFunction( FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists, FunctionImporter::ImportThresholdsTy &ImportThresholds) { - computeImportForReferencedGlobals(Summary, DefinedGVSummaries, ImportList, - ExportLists); + computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries, + ImportList, ExportLists); static int ImportCount = 0; for (auto &Edge : Summary.calls()) { ValueInfo VI = Edge.first; @@ -462,7 +477,8 @@ static void computeImportForFunction( CalleeSummary = CalleeSummary->getBaseObject(); ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary); - assert(ResolvedCalleeSummary->instCount() <= NewThreshold && + assert((ResolvedCalleeSummary->fflags().AlwaysInline || + (ResolvedCalleeSummary->instCount() <= NewThreshold)) && "selectCallee() didn't honor the threshold"); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); @@ -481,7 +497,7 @@ static void computeImportForFunction( // Make exports in the source module. if (ExportLists) { auto &ExportList = (*ExportLists)[ExportModulePath]; - ExportList.insert(VI.getGUID()); + ExportList.insert(VI); if (!PreviouslyImported) { // This is the first time this function was exported from its source // module, so mark all functions and globals it references as exported @@ -489,14 +505,11 @@ static void computeImportForFunction( // For efficiency, we unconditionally add all the referenced GUIDs // to the ExportList for this module, and will prune out any not // defined in the module later in a single pass. - for (auto &Edge : ResolvedCalleeSummary->calls()) { - auto CalleeGUID = Edge.first.getGUID(); - ExportList.insert(CalleeGUID); - } - for (auto &Ref : ResolvedCalleeSummary->refs()) { - auto GUID = Ref.getGUID(); - ExportList.insert(GUID); - } + for (auto &Edge : ResolvedCalleeSummary->calls()) + ExportList.insert(Edge.first); + + for (auto &Ref : ResolvedCalleeSummary->refs()) + ExportList.insert(Ref); } } } @@ -591,29 +604,64 @@ static void ComputeImportForModule( } #ifndef NDEBUG +static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) { + auto SL = VI.getSummaryList(); + return SL.empty() + ? 
false + : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; +} + static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, GlobalValue::GUID G) { - if (const auto &VI = Index.getValueInfo(G)) { - auto SL = VI.getSummaryList(); - if (!SL.empty()) - return SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; - } + if (const auto &VI = Index.getValueInfo(G)) + return isGlobalVarSummary(Index, VI); return false; } -static GlobalValue::GUID getGUID(GlobalValue::GUID G) { return G; } - template <class T> static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont) { unsigned NumGVS = 0; for (auto &V : Cont) - if (isGlobalVarSummary(Index, getGUID(V))) + if (isGlobalVarSummary(Index, V)) ++NumGVS; return NumGVS; } #endif +#ifndef NDEBUG +static bool +checkVariableImport(const ModuleSummaryIndex &Index, + StringMap<FunctionImporter::ImportMapTy> &ImportLists, + StringMap<FunctionImporter::ExportSetTy> &ExportLists) { + + DenseSet<GlobalValue::GUID> FlattenedImports; + + for (auto &ImportPerModule : ImportLists) + for (auto &ExportPerModule : ImportPerModule.second) + FlattenedImports.insert(ExportPerModule.second.begin(), + ExportPerModule.second.end()); + + // Checks that all GUIDs of read/writeonly vars we see in export lists + // are also in the import lists. Otherwise we may face linker undefs, + // because readonly and writeonly vars are internalized in their + // source modules. + auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) { + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + Index.findSummaryInModule(VI, ModulePath)); + return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS)); + }; + + for (auto &ExportPerModule : ExportLists) + for (auto &VI : ExportPerModule.second) + if (!FlattenedImports.count(VI.getGUID()) && + IsReadOrWriteOnlyVar(ExportPerModule.first(), VI)) + return false; + + return true; +} +#endif + /// Compute all the import and export for every module using the Index. void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, @@ -639,13 +687,14 @@ void llvm::ComputeCrossModuleImport( const auto &DefinedGVSummaries = ModuleToDefinedGVSummaries.lookup(ELI.first()); for (auto EI = ELI.second.begin(); EI != ELI.second.end();) { - if (!DefinedGVSummaries.count(*EI)) - EI = ELI.second.erase(EI); + if (!DefinedGVSummaries.count(EI->getGUID())) + ELI.second.erase(EI++); else ++EI; } } + assert(checkVariableImport(Index, ImportLists, ExportLists)); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size() << " modules:\n"); @@ -852,18 +901,8 @@ void llvm::computeDeadSymbolsWithConstProp( function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); - if (ImportEnabled) { + if (ImportEnabled) Index.propagateAttributes(GUIDPreservedSymbols); - } else { - // If import is disabled we should drop read/write-only attribute - // from all summaries to prevent internalization.
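The checkVariableImport helper above is a debug-build invariant: any read- or write-only variable that lands in some module's export list must be imported by at least one module, or internalization in its defining module would leave a dangling reference at link time. A toy model of the same flatten-and-cross-check, using plain standard containers in place of StringMap/DenseSet:

    #include <map>
    #include <set>
    #include <string>
    #include <vector>

    using GUID = unsigned long long;

    // Per-module import/export lists, keyed by module path.
    using Lists = std::map<std::string, std::vector<GUID>>;

    // True when every exported read/write-only variable GUID appears in some
    // module's import list (the invariant asserted by the patch).
    bool checkVariableImportModel(const Lists &Imports,
                                  const Lists &ExportedROWOVars) {
      std::set<GUID> Flattened;
      for (const auto &PerModule : Imports)
        Flattened.insert(PerModule.second.begin(), PerModule.second.end());
      for (const auto &PerModule : ExportedROWOVars)
        for (GUID G : PerModule.second)
          if (!Flattened.count(G))
            return false; // would surface as a linker undef
      return true;
    }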
- for (auto &P : Index) - for (auto &S : P.second.SummaryList) - if (auto *GVS = dyn_cast<GlobalVarSummary>(S.get())) { - GVS->setReadOnly(false); - GVS->setWriteOnly(false); - } - } } /// Compute the set of summaries needed for a ThinLTO backend compilation of diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp index f010f7b703a6..72b8d7522f04 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -22,7 +22,9 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 819715b9f8da..0fd966457ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -53,6 +52,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -65,6 +65,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <utility> @@ -432,6 +433,20 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { return true; } +static bool CanDoGlobalSRA(GlobalVariable *GV) { + Constant *Init = GV->getInitializer(); + + if (isa<StructType>(Init->getType())) { + // nothing to check + } else if (SequentialType *STy = dyn_cast<SequentialType>(Init->getType())) { + if (STy->getNumElements() > 16 && GV->hasNUsesOrMore(16)) + return false; // It's not worth it. + } else + return false; + + return GlobalUsersSafeToSRA(GV); +} + /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, @@ -461,88 +476,94 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. - if (!GlobalUsersSafeToSRA(GV)) + if (!CanDoGlobalSRA(GV)) return nullptr; assert(GV->hasLocalLinkage()); Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); - std::vector<GlobalVariable *> NewGlobals; - Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + std::map<unsigned, GlobalVariable *> NewGlobals; // Get the alignment of the global, either explicit or target-specific. 
unsigned StartAlignment = GV->getAlignment(); if (StartAlignment == 0) StartAlignment = DL.getABITypeAlignment(GV->getType()); - if (StructType *STy = dyn_cast<StructType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - NewGlobals.reserve(NumElements); - const StructLayout &Layout = *DL.getStructLayout(STy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + // Loop over all users and create replacement variables for used aggregate + // elements. + for (User *GEP : GV->users()) { + assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() == + Instruction::GetElementPtr) || + isa<GetElementPtrInst>(GEP)) && + "NonGEP CE's are not SRAable!"); + + // Ignore operand 1, which has to be zero or else the program is quite + // broken (undefined). Get operand 2, which is the structure or array + // index. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + if (NewGlobals.count(ElementIdx) == 1) + continue; // we've already created a replacement variable + assert(NewGlobals.count(ElementIdx) == 0); + + Type *ElTy = nullptr; + if (StructType *STy = dyn_cast<StructType>(Ty)) + ElTy = STy->getElementType(ElementIdx); + else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) + ElTy = STy->getElementType(); + assert(ElTy); + + Constant *In = Init->getAggregateElement(ElementIdx); + assert(In && "Couldn't get element of initializer?"); + + GlobalVariable *NGV = new GlobalVariable( + ElTy, false, GlobalVariable::InternalLinkage, In, + GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(), + GV->getType()->getAddressSpace()); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + NGV->copyAttributesFrom(GV); + NewGlobals.insert(std::make_pair(ElementIdx, NGV)); + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout &Layout = *DL.getStructLayout(STy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. - uint64_t FieldOffset = Layout.getElementOffset(i); + uint64_t FieldOffset = Layout.getElementOffset(ElementIdx); Align NewAlign(MinAlign(StartAlignment, FieldOffset)); - if (NewAlign > Align(DL.getABITypeAlignment(STy->getElementType(i)))) + if (NewAlign > + Align(DL.getABITypeAlignment(STy->getElementType(ElementIdx)))) NGV->setAlignment(NewAlign); // Copy over the debug info for the variable. uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); - uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(i); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, NumElements); - } - } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - if (NumElements > 16 && GV->hasNUsesOrMore(16)) - return nullptr; // It's not worth it.
- NewGlobals.reserve(NumElements); - auto ElTy = STy->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(ElTy); - Align EltAlign(DL.getABITypeAlignment(ElTy)); - uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); + transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, + STy->getNumElements()); + } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { + uint64_t EltSize = DL.getTypeAllocSize(ElTy); + Align EltAlign(DL.getABITypeAlignment(ElTy)); + uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. - Align NewAlign(MinAlign(StartAlignment, EltSize * i)); + Align NewAlign(MinAlign(StartAlignment, EltSize * ElementIdx)); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); - transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits, - NumElements); + transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, + FragmentSizeInBits, STy->getNumElements()); } } if (NewGlobals.empty()) return nullptr; + Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + for (auto NewGlobalVar : NewGlobals) + Globals.push_back(NewGlobalVar.second); + LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt = Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -558,11 +579,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Ignore operand 1, which has to be zero or else the program is quite // broken (undefined). Get operand 2, which is the structure or array // index. - unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + assert(NewGlobals.count(ElementIdx) == 1); - Value *NewPtr = NewGlobals[Val]; - Type *NewTy = NewGlobals[Val]->getValueType(); + Value *NewPtr = NewGlobals[ElementIdx]; + Type *NewTy = NewGlobals[ElementIdx]->getValueType(); // Form a shorter GEP if needed. if (GEP->getNumOperands() > 3) { @@ -580,7 +601,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) Idxs.push_back(GEPI->getOperand(i)); NewPtr = GetElementPtrInst::Create( - NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI); + NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx), + GEPI); } } GEP->replaceAllUsesWith(NewPtr); @@ -595,17 +617,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { Globals.erase(GV); ++NumSRA; - // Loop over the new globals array deleting any globals that are obviously - // dead. This can arise due to scalarization of a structure or an array that - // has elements that are dead.
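The rewritten SRAGlobal above no longer materializes one replacement global per aggregate element; it walks the GEP users first and creates a replacement only for each element index that is actually referenced, keyed in a std::map. A toy model of that lazy, index-keyed bookkeeping (names are illustrative, not the pass's types):

    #include <map>
    #include <string>

    struct ReplacementVar { unsigned Index; std::string Name; };

    // Create the replacement for ElementIdx on first use; later GEPs hitting
    // the same element reuse the entry, and untouched elements never get one.
    ReplacementVar &getOrCreateReplacement(
        std::map<unsigned, ReplacementVar> &NewGlobals, unsigned ElementIdx,
        const std::string &BaseName) {
      auto It = NewGlobals.find(ElementIdx);
      if (It != NewGlobals.end())
        return It->second; // already created for this element
      return NewGlobals
          .emplace(ElementIdx,
                   ReplacementVar{ElementIdx,
                                  BaseName + "." + std::to_string(ElementIdx)})
          .first->second;
    }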
- unsigned FirstGlobal = 0; - for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i) - if (NewGlobals[i]->use_empty()) { - Globals.erase(NewGlobals[i]); - if (FirstGlobal == i) ++FirstGlobal; - } - - return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; + assert(NewGlobals.size() > 0); + return NewGlobals.begin()->second; } /// Return true if all users of the specified value will trap if the value is @@ -2285,10 +2298,14 @@ OptimizeFunctions(Module &M, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. + // Removing unreachable blocks might invalidate the dominator so we + // recalculate it. if (!F->isDeclaration()) { - auto &DT = LookupDomTree(*F); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - Changed |= removeUnreachableBlocks(*F, &DTU); + if (removeUnreachableBlocks(*F)) { + auto &DT = LookupDomTree(*F); + DT.recalculate(*F); + Changed = true; + } } Changed |= processGlobal(*F, GetTLI, LookupDomTree); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp index 060043a40b89..4a319ead23c0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index cfdcc8db7f50..5e690714bfdf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -25,6 +25,7 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -53,13 +54,14 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -205,6 +207,11 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { if (F.hasFnAttribute(Attribute::NoInline)) return false; + // A function marked `noreturn` may contain unreachable terminators: these + // should not be considered cold, as the function may be a trampoline. 
+ if (F.hasFnAttribute(Attribute::NoReturn)) + return false; + if (F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::SanitizeHWAddress) || F.hasFnAttribute(Attribute::SanitizeThread) || @@ -326,6 +333,9 @@ Function *HotColdSplitting::extractColdRegion( } CI->setIsNoInline(); + if (OrigF->hasSection()) + OutF->setSection(OrigF->getSection()); + markFunctionCold(*OutF, BFI != nullptr); LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp index 7dc4d9ee9e34..1bda13a9bdd8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -254,7 +255,7 @@ static bool PropagateConstantReturn(Function &F) { // Find the index of the retval to replace with int index = -1; if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins)) - if (EV->hasIndices()) + if (EV->getNumIndices() == 1) index = *EV->idx_begin(); // If this use uses a specific return value, and we have a replacement, diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp index bddf75211599..8a15800cbdb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp @@ -43,7 +43,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeBlockExtractorPass(Registry); initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); - initializeMergeFunctionsPass(Registry); + initializeMergeFunctionsLegacyPassPass(Registry); initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializePostOrderFunctionAttrsLegacyPassPass(Registry); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index d1a68b28bd33..685f8f7d7a00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -11,6 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp index efb71b73cbb7..e818743544e6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp index 2e269604e379..e1644819af61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp index add2ae053735..f7108e8002ac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 2dec366d70e2..fa664966faf7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" @@ -1810,7 +1811,7 @@ bool LowerTypeTestsModule::lower() { // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. using GlobalClassesTy = EquivalenceClasses< - PointerUnion3<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; + PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; GlobalClassesTy GlobalClasses; // Verify the type metadata and build a few data structures to let us diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 8b9abaddc84c..06d2a2f31941 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -115,12 +115,14 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/Utils/FunctionComparator.h" #include <algorithm> #include <cassert> @@ -195,16 +197,12 @@ public: /// by considering all pointer types to be equivalent. Once identified, /// MergeFunctions will fold them by replacing a call to one to a call to a /// bitcast of the other. 
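The MergeFunctions hunk that follows applies the standard two-pass-manager port: the transform logic moves into a plain class, a thin legacy ModulePass wrapper keeps `opt -mergefunc` working, and a new-pass-manager adapter drives the same runOnModule. A skeletal sketch of the pattern, with the LLVM plumbing elided and all names hypothetical:

    // Shared implementation, independent of either pass manager.
    class TransformImpl {
    public:
      bool runOnModule(/* Module &M */) { return false; /* real work here */ }
    };

    // Legacy-PM wrapper: owns no state, just forwards to the impl.
    struct TransformLegacyPass /* : ModulePass */ {
      bool runOnModule(/* Module &M */) {
        TransformImpl Impl;
        return Impl.runOnModule();
      }
    };

    // New-PM wrapper: the same result decides the PreservedAnalyses answer
    // (false => PreservedAnalyses::all(), true => none()).
    struct TransformPass /* : PassInfoMixin<TransformPass> */ {
      bool run(/* Module &M, ModuleAnalysisManager &AM */) {
        TransformImpl Impl;
        return Impl.runOnModule();
      }
    };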
-class MergeFunctions : public ModulePass { +class MergeFunctions { public: - static char ID; - - MergeFunctions() - : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)) { - initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); + MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) { } - bool runOnModule(Module &M) override; + bool runOnModule(Module &M); private: // The function comparison operator is provided here so that FunctionNodes do @@ -297,14 +295,39 @@ private: DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree; }; -} // end anonymous namespace +class MergeFunctionsLegacyPass : public ModulePass { +public: + static char ID; + + MergeFunctionsLegacyPass(): ModulePass(ID) { + initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; -char MergeFunctions::ID = 0; + MergeFunctions MF; + return MF.runOnModule(M); + } +}; -INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) +} // end anonymous namespace + +char MergeFunctionsLegacyPass::ID = 0; +INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc", + "Merge Functions", false, false) ModulePass *llvm::createMergeFunctionsPass() { - return new MergeFunctions(); + return new MergeFunctionsLegacyPass(); +} + +PreservedAnalyses MergeFunctionsPass::run(Module &M, + ModuleAnalysisManager &AM) { + MergeFunctions MF; + if (!MF.runOnModule(M)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); } #ifndef NDEBUG @@ -386,9 +409,6 @@ static bool isEligibleForMerging(Function &F) { } bool MergeFunctions::runOnModule(Module &M) { - if (skipModule(M)) - return false; - bool Changed = false; // All functions in the module, ordered by hash. Functions with a unique @@ -449,28 +469,10 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { - // Transfer the called function's attributes to the call site. Due to the - // bitcast we will 'lose' ABI changing attributes because the 'called - // function' is no longer a Function* but the bitcast. Code that looks up - // the attributes from the called function will fail. - - // FIXME: This is not actually true, at least not anymore. The callsite - // will always have the same ABI affecting attributes as the callee, - // because otherwise the original input has UB. Note that Old and New - // always have matching ABI, so no attributes need to be changed. - // Transferring other attributes may help other optimizations, but that - // should be done uniformly and not in this ad-hoc way. - auto &Context = New->getContext(); - auto NewPAL = New->getAttributes(); - SmallVector<AttributeSet, 4> NewArgAttrs; - for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) - NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx)); - // Don't transfer attributes from the function to the callee. Function - // attributes typically aren't relevant to the calling convention or ABI. - CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), - NewPAL.getRetAttributes(), - NewArgAttrs)); - + // Do not copy attributes from the called function to the call-site. + // Function comparison ensures that the attributes are the same up to + // type congruences in byval(), in which case we need to keep the byval + // type of the call-site, not the callee function. 
remove(CS.getInstruction()->getFunction()); U->set(BitcastNew); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index e193074884af..cd3701e90308 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -701,7 +702,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { return OutliningInfo; } -// Check if there is PGO data or user annoated branch data: +// Check if there is PGO data or user annotated branch data: static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { if (F->hasProfileData()) return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 5314a8219b1e..9c992830879a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -147,6 +147,10 @@ cl::opt<bool> EnableOrderFileInstrumentation( "enable-order-file-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable order file instrumentation (default = off)")); +static cl::opt<bool> + EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, + cl::desc("Enable lowering of the matrix intrinsics")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -320,19 +324,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. // Break up aggregate allocas, using SSAUpdater. + assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!"); MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies - if (EnableGVNHoist) - MPM.add(createGVNHoistPass()); - if (EnableGVNSink) { - MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + + if (OptLevel > 1) { + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } } - // Speculative execution if the target has divergent branches; otherwise nop. - MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); - MPM.add(createJumpThreadingPass()); // Thread jumps. - MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + if (OptLevel > 1) { + // Speculative execution if the target has divergent branches; otherwise nop. + MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); + + MPM.add(createJumpThreadingPass()); // Thread jumps. + MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + } MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) @@ -346,8 +357,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (SizeLevel == 0) MPM.add(createPGOMemOPSizeOptLegacyPass()); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + // TODO: Investigate the cost/benefit of tail call elimination on debugging. 
+ if (OptLevel > 1) + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // Begin the loop pass pipeline. @@ -360,6 +373,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + // TODO: Investigate promotion cap for O1. MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); @@ -402,16 +416,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // opened up by them. addInstructionCombiningPass(MPM); addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createJumpThreadingPass()); // Thread jumps - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (OptLevel > 1) { + MPM.add(createJumpThreadingPass()); // Thread jumps + MPM.add(createCorrelatedValuePropagationPass()); + MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); if (RerollLoops) MPM.add(createLoopRerollPass()); + // TODO: Investigate if this is too expensive at O1. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. @@ -656,6 +673,14 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createFloat2IntPass()); MPM.add(createLowerConstantIntrinsicsPass()); + if (EnableMatrix) { + MPM.add(createLowerMatrixIntrinsicsPass()); + // CSE the pointer arithmetic of the column vectors. This allows alias + // analysis to establish no-aliasing between loads and stores of different + // columns of the same matrix. + MPM.add(createEarlyCSEPass(false)); + } + addExtensionsToPM(EP_VectorizerStart, MPM); // Re-rotate loops in all our loop nests. These may have fallout out of @@ -899,7 +924,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // LTO provides additional opportunities for tailcall elimination due to // link-time inlining, and visibility of nocapture attribute. - PM.add(createTailCallEliminationPass()); + if (OptLevel > 1) + PM.add(createTailCallEliminationPass()); // Infer attributes on declarations, call sites, arguments, etc. PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture. 
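A recurring theme in the PassManagerBuilder changes above is wrapping mid-level scalar passes in `OptLevel > 1` guards so that -O1 stays fast and debuggable. A compilable model of that gating, with string labels standing in for the real pass factory calls:

    #include <string>
    #include <utility>
    #include <vector>

    struct PipelineModel {
      std::vector<std::string> Passes;
      void add(std::string P) { Passes.push_back(std::move(P)); }
    };

    // Mirrors the shape of addFunctionSimplificationPasses after the patch:
    // cheap cleanups always run; the heavier scalar optimizations are
    // reserved for OptLevel > 1.
    void addSimplificationPasses(PipelineModel &MPM, unsigned OptLevel) {
      if (OptLevel > 1) {
        MPM.add("speculative-execution");
        MPM.add("jump-threading");
        MPM.add("correlated-propagation");
        MPM.add("tailcallelim");
      }
      MPM.add("simplifycfg"); // merge & remove blocks at every level
    }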
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp index cb3915dfb678..45a0ce20eb17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -18,15 +18,16 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp index 307690729b14..fdffffba0c2d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp @@ -2,6 +2,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar/SCCP.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 6184681db8a2..a1fbb1adc412 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,13 +26,17 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -57,6 +61,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProf.h" @@ -90,6 +95,12 @@ using namespace llvm; using namespace sampleprof; using ProfileCount = Function::ProfileCount; #define DEBUG_TYPE "sample-profile" +#define CSINLINE_DEBUG DEBUG_TYPE "-inline" + +STATISTIC(NumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(NumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); // Command line option to specify the file to read samples from. This is // mainly used for debugging. @@ -136,6 +147,25 @@ static cl::opt<bool> ProfileAccurateForSymsInList( cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. 
")); +static cl::opt<bool> ProfileMergeInlinee( + "sample-profile-merge-inlinee", cl::Hidden, cl::init(false), + cl::desc("Merge past inlinee's profile to outline version if sample " + "profile loader decided not to inline a call site.")); + +static cl::opt<bool> ProfileTopDownLoad( + "sample-profile-top-down-load", cl::Hidden, cl::init(false), + cl::desc("Do profile annotation and inlining for functions in top-down " + "order of call graph during sample profile loading.")); + +static cl::opt<bool> ProfileSizeInline( + "sample-profile-inline-size", cl::Hidden, cl::init(false), + cl::desc("Inline cold call sites in profile loader if it's beneficial " + "for code size.")); + +static cl::opt<int> SampleColdCallSiteThreshold( + "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), + cl::desc("Threshold for inlining cold callsites")); + namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -285,7 +315,7 @@ public: bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI); + ProfileSummaryInfo *_PSI, CallGraph *CG); void dump() { Reader->dump(); } @@ -305,6 +335,10 @@ protected: bool inlineCallInstruction(Instruction *I); bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + // Inline cold/small functions in addition to hot ones + bool shouldInlineColdCallee(Instruction &CallInst); + void emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, bool Hot); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -317,6 +351,7 @@ protected: void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); + std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); @@ -869,21 +904,52 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC, None, nullptr, nullptr); if (Cost.isNever()) { - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) << "incompatible inlining"); return false; } InlineFunctionInfo IFI(nullptr, &GetAC); if (InlineFunction(CS, IFI)) { // The call to InlineFunction erases I, so we can't pass it here. 
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) - << "inlined hot callee '" << ore::NV("Callee", CalledFunction) + ORE->emit(OptimizationRemark(CSINLINE_DEBUG, "InlineSuccess", DLoc, BB) + << "inlined callee '" << ore::NV("Callee", CalledFunction) << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); return true; } return false; } +bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) { + if (!ProfileSizeInline) + return false; + + Function *Callee = CallSite(&CallInst).getCalledFunction(); + if (Callee == nullptr) + return false; + + InlineCost Cost = + getInlineCost(cast<CallBase>(CallInst), getInlineParams(), + GetTTI(*Callee), GetAC, None, nullptr, nullptr); + + return Cost.getCost() <= SampleColdCallSiteThreshold; +} + +void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, + bool Hot) { + for (auto I : Candidates) { + Function *CalledFunction = CallSite(I).getCalledFunction(); + if (CalledFunction) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt", + I->getDebugLoc(), I->getParent()) + << "previous inlining reattempted for " + << (Hot ? "hotness: '" : "size: '") + << ore::NV("Callee", CalledFunction) << "' into '" + << ore::NV("Caller", &F) << "'"); + } + } +} + /// Iteratively inline hot callsites of a function. /// /// Iteratively traverse all callsites of the function \p F, and find if @@ -916,20 +982,28 @@ bool SampleProfileLoader::inlineHotFunctions( SmallVector<Instruction *, 10> CIS; for (auto &BB : F) { bool Hot = false; - SmallVector<Instruction *, 10> Candidates; + SmallVector<Instruction *, 10> AllCandidates; + SmallVector<Instruction *, 10> ColdCandidates; for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { - Candidates.push_back(&I); + AllCandidates.push_back(&I); if (FS->getEntrySamples() > 0) localNotInlinedCallSites.try_emplace(&I, FS); if (callsiteIsHot(FS, PSI)) Hot = true; + else if (shouldInlineColdCallee(I)) + ColdCandidates.push_back(&I); } } if (Hot) { - CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); + CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end()); + emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true); + } + else { + CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end()); + emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false); } } for (auto I : CIS) { @@ -975,6 +1049,7 @@ bool SampleProfileLoader::inlineHotFunctions( inlineCallInstruction(DI)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else { LLVM_DEBUG(dbgs() @@ -987,6 +1062,7 @@ bool SampleProfileLoader::inlineHotFunctions( if (inlineCallInstruction(I)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else if (IsThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( @@ -1006,10 +1082,35 @@ bool SampleProfileLoader::inlineHotFunctions( Function *Callee = CallSite(I).getCalledFunction(); if (!Callee || Callee->isDeclaration()) continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; const FunctionSamples *FS = Pair.getSecond(); - auto pair = - 
notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); + const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } } return Changed; } @@ -1673,6 +1774,33 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) +std::vector<Function *> +SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { + std::vector<Function *> FunctionOrderList; + FunctionOrderList.reserve(M.size()); + + if (!ProfileTopDownLoad || CG == nullptr) { + for (Function &F : M) + if (!F.isDeclaration()) + FunctionOrderList.push_back(&F); + return FunctionOrderList; + } + + assert(&CG->getModule() == &M); + scc_iterator<CallGraph *> CGI = scc_begin(CG); + while (!CGI.isAtEnd()) { + for (CallGraphNode *node : *CGI) { + auto F = node->getFunction(); + if (F && !F->isDeclaration()) + FunctionOrderList.push_back(F); + } + ++CGI; + } + + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); + return FunctionOrderList; +} + bool SampleProfileLoader::doInitialization(Module &M) { auto &Ctx = M.getContext(); @@ -1710,7 +1838,7 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { } bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI) { + ProfileSummaryInfo *_PSI, CallGraph *CG) { GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); if (!ProfileIsValid) return false; @@ -1745,11 +1873,11 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } bool retval = false; - for (auto &F : M) - if (!F.isDeclaration()) { - clearFunctionData(); - retval |= runOnFunction(F, AM); - } + for (auto F : buildFunctionOrder(M, CG)) { + assert(!F->isDeclaration()); + clearFunctionData(); + retval |= runOnFunction(*F, AM); + } // Account for cold calls not inlined.... 
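The -sample-profile-merge-inlinee branch above folds a not-reinlined inlinee's context profile back into its outline copy, first seeding head samples from entry samples because inlinee profiles carry none. A scalar model of that bookkeeping (field set reduced to the counters the patch touches):

    struct SamplesModel {
      long long HeadSamples = 0;
      long long EntrySamples = 0;
      long long TotalSamples = 0;
      void merge(const SamplesModel &Other) {
        HeadSamples += Other.HeadSamples;
        EntrySamples += Other.EntrySamples;
        TotalSamples += Other.TotalSamples;
      }
    };

    // Entry samples stand in for the missing head samples, matching the
    // patch's addHeadSamples() call before FunctionSamples::merge().
    void mergeInlineeIntoOutline(SamplesModel Inlinee, SamplesModel &Outline) {
      Inlinee.HeadSamples = Inlinee.EntrySamples;
      Outline.merge(Inlinee);
    }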
for (const std::pair<Function *, NotInlinedProfileInfo> &pair : @@ -1764,7 +1892,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); ProfileSummaryInfo *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - return SampleLoader.runOnModule(M, nullptr, PSI); + return SampleLoader.runOnModule(M, nullptr, PSI, nullptr); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { @@ -1845,10 +1973,12 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, : ProfileRemappingFileName, IsThinLTOPreLink, GetAssumptionCache, GetTTI); - SampleLoader.doInitialization(M); + if (!SampleLoader.doInitialization(M)) + return PreservedAnalyses::all(); ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); - if (!SampleLoader.runOnModule(M, &AM, PSI)) + CallGraph &CG = AM.getResult<CallGraphAnalysis>(M); + if (!SampleLoader.runOnModule(M, &AM, PSI, &CG)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 106db3c8bd9d..655a7a404951 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -16,6 +16,7 @@ #include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp index 67a473612fc1..6ce00714523b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -28,8 +27,10 @@ #include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; namespace { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 690b5e8bf49e..87a18171787f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Pass.h" #include "llvm/Support/ScopedPrinter.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index f0cf5581ba8a..5ccfb29b01a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -80,10 +80,12 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" +#include 
"llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" @@ -486,7 +488,7 @@ struct DevirtModule { bool areRemarksEnabled(); - void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc); + void scanTypeTestUsers(Function *TypeTestFunc); void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc); void buildTypeIdentifierMap( @@ -615,8 +617,8 @@ struct WholeProgramDevirt : public ModulePass { bool UseCommandLine = false; - ModuleSummaryIndex *ExportSummary; - const ModuleSummaryIndex *ImportSummary; + ModuleSummaryIndex *ExportSummary = nullptr; + const ModuleSummaryIndex *ImportSummary = nullptr; WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) { initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); @@ -709,7 +711,7 @@ void runWholeProgramDevirtOnIndex( void updateIndexWPDForExports( ModuleSummaryIndex &Summary, - function_ref<bool(StringRef, GlobalValue::GUID)> isExported, + function_ref<bool(StringRef, ValueInfo)> isExported, std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { for (auto &T : LocalWPDTargetsMap) { auto &VI = T.first; @@ -717,7 +719,7 @@ void updateIndexWPDForExports( assert(VI.getSummaryList().size() == 1 && "Devirt of local target has more than one copy"); auto &S = VI.getSummaryList()[0]; - if (!isExported(S->modulePath(), VI.getGUID())) + if (!isExported(S->modulePath(), VI)) continue; // It's been exported by a cross module import. @@ -840,18 +842,31 @@ bool DevirtModule::tryFindVirtualCallTargets( bool DevirtIndex::tryFindVirtualCallTargets( std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { - for (const TypeIdOffsetVtableInfo P : TIdInfo) { - // VTable initializer should have only one summary, or all copies must be - // linkonce/weak ODR. - assert(P.VTableVI.getSummaryList().size() == 1 || - llvm::all_of( - P.VTableVI.getSummaryList(), - [&](const std::unique_ptr<GlobalValueSummary> &Summary) { - return GlobalValue::isLinkOnceODRLinkage(Summary->linkage()) || - GlobalValue::isWeakODRLinkage(Summary->linkage()); - })); - const auto *VS = cast<GlobalVarSummary>(P.VTableVI.getSummaryList()[0].get()); - if (!P.VTableVI.getSummaryList()[0]->isLive()) + for (const TypeIdOffsetVtableInfo &P : TIdInfo) { + // Find the first non-available_externally linkage vtable initializer. + // We can have multiple available_externally, linkonce_odr and weak_odr + // vtable initializers, however we want to skip available_externally as they + // do not have type metadata attached, and therefore the summary will not + // contain any vtable functions. We can also have multiple external + // vtable initializers in the case of comdats, which we cannot check here. + // The linker should give an error in this case. + // + // Also, handle the case of same-named local Vtables with the same path + // and therefore the same GUID. This can happen if there isn't enough + // distinguishing path when compiling the source file. In that case we + // conservatively return false early. 
+ const GlobalVarSummary *VS = nullptr; + bool LocalFound = false; + for (auto &S : P.VTableVI.getSummaryList()) { + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (LocalFound) + return false; + LocalFound = true; + } + if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) + VS = cast<GlobalVarSummary>(S->getBaseObject()); + } + if (!VS->isLive()) continue; for (auto VTP : VS->vTableFuncs()) { if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) @@ -1233,8 +1248,7 @@ std::string DevirtModule::getGlobalName(VTableSlot Slot, bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() { Triple T(M.getTargetTriple()); - return (T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) && - T.getObjectFormat() == Triple::ELF; + return T.isX86() && T.getObjectFormat() == Triple::ELF; } void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, @@ -1548,8 +1562,7 @@ bool DevirtModule::areRemarksEnabled() { return false; } -void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, - Function *AssumeFunc) { +void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) { // Find all virtual calls via a virtual table pointer %p under an assumption // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p // points to a member of the type identifier %md. Group calls by (type ID, @@ -1784,7 +1797,7 @@ bool DevirtModule::run() { return false; if (TypeTestFunc && AssumeFunc) - scanTypeTestUsers(TypeTestFunc, AssumeFunc); + scanTypeTestUsers(TypeTestFunc); if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 8bc34825f8a7..ec976a971e3c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -890,6 +890,10 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) return SelectInst::Create(X, AddOne(Op1C), Op1); + // sext(bool) + C -> bool ? C - 1 : C + if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return SelectInst::Create(X, SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) @@ -1288,12 +1292,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(RHS, A); } - // Canonicalize sext to zext for better value tracking potential. - // add A, sext(B) --> sub A, zext(B) - if (match(&I, m_c_Add(m_Value(A), m_OneUse(m_SExt(m_Value(B))))) && - B->getType()->isIntOrIntVectorTy(1)) - return BinaryOperator::CreateSub(A, Builder.CreateZExt(B, Ty)); - // A + -B --> A - B if (match(RHS, m_Neg(m_Value(B)))) return BinaryOperator::CreateSub(LHS, B); @@ -1587,7 +1585,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - Type *Ty) { + Type *Ty, bool IsNUW) { // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; @@ -1655,6 +1653,15 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // Emit the offset of the GEP and an intptr_t. 
Value *Result = EmitGEPOffset(GEP1); + // If this is a single inbounds GEP and the original sub was nuw, + // then the final multiplication is also nuw. We match an extra add zero + // here, because that's what EmitGEPOffset() generates. + Instruction *I; + if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() && + match(Result, m_Add(m_Instruction(I), m_Zero())) && + I->getOpcode() == Instruction::Mul) + I->setHasNoUnsignedWrap(); + // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. if (GEP2) { @@ -1881,6 +1888,74 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Y, Builder.CreateNot(Op1, Op1->getName() + ".not")); } + { + // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1)) + Value *X; + if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1), + m_OneUse(m_Neg(m_Value(X))))))) { + return BinaryOperator::CreateNeg(Builder.CreateAnd( + Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType())))); + } + } + + { + // (sub (and Op1, C), Op1) --> neg (and Op1, ~C) + Constant *C; + if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) { + return BinaryOperator::CreateNeg( + Builder.CreateAnd(Op1, Builder.CreateNot(C))); + } + } + + { + // If we have a subtraction between some value and a select between + // said value and something else, sink subtraction into select hands, i.e.: + // sub (select %Cond, %TrueVal, %FalseVal), %Op1 + // -> + // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1) + // or + // sub %Op0, (select %Cond, %TrueVal, %FalseVal) + // -> + // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal) + // This will result in select between new subtraction and 0. + auto SinkSubIntoSelect = + [Ty = I.getType()](Value *Select, Value *OtherHandOfSub, + auto SubBuilder) -> Instruction * { + Value *Cond, *TrueVal, *FalseVal; + if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal), + m_Value(FalseVal))))) + return nullptr; + if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal) + return nullptr; + // While it is really tempting to just create two subtractions and let + // InstCombine fold one of those to 0, it isn't possible to do so + // because of worklist visitation order. So ugly it is. + bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal; + Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal); + Constant *Zero = Constant::getNullValue(Ty); + SelectInst *NewSel = + SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub, + OtherHandOfSubIsTrueVal ? NewSub : Zero); + // Preserve prof metadata if any. + NewSel->copyMetadata(cast<Instruction>(*Select)); + return NewSel; + }; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op0, /*OtherHandOfSub=*/Op1, + [Builder = &Builder, Op1](Value *OtherHandOfSelect) { + return Builder->CreateSub(OtherHandOfSelect, + /*OtherHandOfSub=*/Op1); + })) + return NewSel; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op1, /*OtherHandOfSub=*/Op0, + [Builder = &Builder, Op0](Value *OtherHandOfSelect) { + return Builder->CreateSub(/*OtherHandOfSub=*/Op0, + OtherHandOfSelect); + })) + return NewSel; + } + if (Op1->hasOneUse()) { Value *X = nullptr, *Y = nullptr, *Z = nullptr; Constant *C = nullptr; @@ -1896,14 +1971,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Builder.CreateNot(Y, Y->getName() + ".not")); // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. - // TODO: This could be extended to match arbitrary vector constants. 
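To illustrate the IsNUW propagation added at the top of this hunk, a sketch (invented names; the exact mul/shl form of the offset is whatever EmitGEPOffset produces):

    define i64 @ptrdiff(i64* %base, i64 %n) {
      %end = getelementptr inbounds i64, i64* %base, i64 %n
      %e = ptrtoint i64* %end to i64
      %b = ptrtoint i64* %base to i64
      %d = sub nuw i64 %e, %b
      ; single inbounds GEP + nuw sub, so the scaled offset may keep nuw,
      ; roughly: %d = mul nuw i64 %n, 8
      ret i64 %d
    }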
- const APInt *DivC; - if (match(Op0, m_Zero()) && match(Op1, m_SDiv(m_Value(X), m_APInt(DivC))) && - !DivC->isMinSignedValue() && *DivC != 1) { - Constant *NegDivC = ConstantInt::get(I.getType(), -(*DivC)); - Instruction *BO = BinaryOperator::CreateSDiv(X, NegDivC); - BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); - return BO; + if (match(Op0, m_Zero())) { + Constant *Op11C; + if (match(Op1, m_SDiv(m_Value(X), m_Constant(Op11C))) && + !Op11C->containsUndefElement() && Op11C->isNotMinSignedValue() && + Op11C->isNotOneValue()) { + Instruction *BO = + BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(Op11C)); + BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); + return BO; + } } // 0 - (X << Y) -> (-X << Y) when X is freely negatable. @@ -1921,6 +1998,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Add->setHasNoSignedWrap(I.hasNoSignedWrap()); return Add; } + // sub [nsw] X, zext(bool Y) -> add [nsw] X, sext(bool Y) + // 'nuw' is dropped in favor of the canonical form. + if (match(Op1, m_ZExt(m_Value(Y))) && Y->getType()->isIntOrIntVectorTy(1)) { + Value *Sext = Builder.CreateSExt(Y, I.getType()); + BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Sext); + Add->setHasNoSignedWrap(I.hasNoSignedWrap()); + return Add; + } // X - A*-B -> X + A*B // X - -A*B -> X + A*B @@ -1975,13 +2060,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *LHSOp, *RHSOp; if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && match(Op1, m_PtrToInt(m_Value(RHSOp)))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + I.hasNoUnsignedWrap())) return replaceInstUsesWith(I, Res); // trunc(p)-trunc(q) -> trunc(p-q) if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + /* IsNUW */ false)) return replaceInstUsesWith(I, Res); // Canonicalize a shifty way to code absolute value to the common pattern. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 4a30b60ca931..cc0a9127f8b1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3279,6 +3279,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { NotLHS, NotRHS); } } + + // Pull 'not' into operands of select if both operands are one-use compares. + // Inverting the predicates eliminates the 'not' operation. + // Example: + // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?)) --> + // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?) + // TODO: Canonicalize by hoisting 'not' into an arm of the select if only + // 1 select operand is a cmp?
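A sketch of the xor fold described by that comment (the implementation follows below; names invented, and both compares must be single-use):

    define i1 @not_select_of_cmps(i1 %c, i32 %x, i32 %y) {
      %t = icmp slt i32 %x, 42
      %f = icmp eq i32 %y, 0
      %s = select i1 %c, i1 %t, i1 %f
      %not = xor i1 %s, true
      ; becomes a select of the inverted compares:
      ; select i1 %c, (icmp sge %x, 42), (icmp ne %y, 0)
      ret i1 %not
    }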
+ if (auto *Sel = dyn_cast<SelectInst>(Op0)) { + auto *CmpT = dyn_cast<CmpInst>(Sel->getTrueValue()); + auto *CmpF = dyn_cast<CmpInst>(Sel->getFalseValue()); + if (CmpT && CmpF && CmpT->hasOneUse() && CmpF->hasOneUse()) { + CmpT->setPredicate(CmpT->getInversePredicate()); + CmpF->setPredicate(CmpF->getInversePredicate()); + return replaceInstUsesWith(I, Sel); + } + } } if (Instruction *NewXor = sinkNotIntoXor(I, Builder)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c650d242cd50..f463c5fa1138 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -40,6 +40,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -2279,6 +2285,35 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::copysign: { + if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { + // If we know that the sign argument is positive, reduce to FABS: + // copysign X, Pos --> fabs X + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Fabs); + } + // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. + const APFloat *C; + if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { + // If we know that the sign argument is negative, reduce to FNABS: + // copysign X, Neg --> fneg (fabs X) + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); + } + + // Propagate sign argument through nested calls: + // copysign X, (copysign ?, SignArg) --> copysign X, SignArg + Value *SignArg; + if (match(II->getArgOperand(1), + m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) { + II->setArgOperand(1, SignArg); + return II; + } + + break; + } case Intrinsic::fabs: { Value *Cond; Constant *LHS, *RHS; @@ -2452,6 +2487,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO should we convert this to an AND if the RHS is constant? } break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. 
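The loop above (and its PDEP twin below) mirrors the usual software emulation of the BMI2 bit-extract, so calls with constant operands fold to a constant; a sketch, assuming standard PEXT semantics:

    define i32 @pext_const() {
      ; src = 0b10101, mask = 0b00111: the three mask positions hold 0b101
      %r = call i32 @llvm.x86.bmi.pext.32(i32 21, i32 7)
      ; becomes: ret i32 5
      ret i32 %r
    }
    declare i32 @llvm.x86.bmi.pext.32(i32, i32)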
+ Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; case Intrinsic::x86_vcvtph2ps_128: case Intrinsic::x86_vcvtph2ps_256: { @@ -3308,6 +3401,60 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && + II->getType() == ArgArg->getType()) + return replaceInstUsesWith(*II, ArgArg); + Constant *XorMask; + if (match(Arg, + m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), + m_Constant(XorMask))) && + II->getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = Builder.CreateVectorSplat( + II->getType()->getVectorNumElements(), Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) + return II; + break; + } + case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) + return replaceInstUsesWith(*II, ArgArg); + if (!II->getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II->getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) + }; + II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); + return II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 
3 : 2; + assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) + return II; + break; + } case Intrinsic::amdgcn_rcp: { Value *Src = II->getArgOperand(0); @@ -3317,7 +3464,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat Val(ArgVal.getSemantics(), 1); APFloat::opStatus Status = Val.divide(ArgVal, APFloat::rmNearestTiesToEven); // Only do this if it was exact and therefore not dependent on the @@ -3872,7 +4019,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return eraseInstFromFunction(CI); // Bail if we cross over an intrinsic with side effects, such as - // llvm.stacksave, llvm.read_register, or llvm.setjmp. + // llvm.stacksave, or llvm.read_register. if (II2->mayHaveSideEffects()) { CannotRemove = true; break; @@ -4019,12 +4166,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Is this guard followed by another guard? We scan forward over a small // fixed window of instructions to handle common cases with conditions // computed between guards. - Instruction *NextInst = II->getNextNode(); + Instruction *NextInst = II->getNextNonDebugInstruction(); for (unsigned i = 0; i < GuardWideningWindow; i++) { // Note: Using context-free form to avoid compile time blow up if (!isSafeToSpeculativelyExecute(NextInst)) break; - NextInst = NextInst->getNextNode(); + NextInst = NextInst->getNextNonDebugInstruction(); } Value *NextCond = nullptr; if (match(NextInst, @@ -4032,18 +4179,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *CurrCond = II->getArgOperand(0); // Remove a guard that it is immediately preceded by an identical guard. - if (CurrCond == NextCond) - return eraseInstFromFunction(*NextInst); - // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). 
- Instruction* MoveI = II->getNextNode(); - while (MoveI != NextInst) { - auto *Temp = MoveI; - MoveI = MoveI->getNextNode(); - Temp->moveBefore(II); + if (CurrCond != NextCond) { + Instruction *MoveI = II->getNextNonDebugInstruction(); + while (MoveI != NextInst) { + auto *Temp = MoveI; + MoveI = MoveI->getNextNonDebugInstruction(); + Temp->moveBefore(II); + } + II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); } - II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); - return eraseInstFromFunction(*NextInst); + eraseInstFromFunction(*NextInst); + return II; } break; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 65aaef28d87a..71b7f279e5fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/DIBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include <numeric> using namespace llvm; using namespace PatternMatch; @@ -843,33 +844,33 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return nullptr; } -Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, +Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, bool DoTransform) { // If we are just checking for a icmp eq of a single bit and zext'ing it // to an integer, then shift the bit to the appropriate place and then // cast to integer to avoid the comparison. const APInt *Op1CV; - if (match(ICI->getOperand(1), m_APInt(Op1CV))) { + if (match(Cmp->getOperand(1), m_APInt(Op1CV))) { // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. - if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || - (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { - if (!DoTransform) return ICI; + if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || + (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { + if (!DoTransform) return Cmp; - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); Value *Sh = ConstantInt::get(In->getType(), In->getType()->getScalarSizeInBits() - 1); In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit"); - if (In->getType() != CI.getType()) - In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/); + if (In->getType() != Zext.getType()) + In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/); - if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { + if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); In = Builder.CreateXor(In, One, In->getName() + ".not"); } - return replaceInstUsesWith(CI, In); + return replaceInstUsesWith(Zext, In); } // zext (X == 0) to i32 --> X^1 iff X has only the low bit set. @@ -882,24 +883,24 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. 
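A sketch of the first transformZExtICmp case mentioned above (invented name):

    define i32 @signbit(i32 %x) {
      %c = icmp slt i32 %x, 0
      %r = zext i1 %c to i32
      ; becomes: %r = lshr i32 %x, 31
      ret i32 %r
    }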
if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) && // This only works for EQ and NE - ICI->isEquality()) { + Cmp->isEquality()) { // If Op1C is some other power of two, convert: - KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); + KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; - bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE; + bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE; if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) { // (X&4) == 2 --> false // (X&4) != 2 --> true - Constant *Res = ConstantInt::get(CI.getType(), isNE); - return replaceInstUsesWith(CI, Res); + Constant *Res = ConstantInt::get(Zext.getType(), isNE); + return replaceInstUsesWith(Zext, Res); } uint32_t ShAmt = KnownZeroMask.logBase2(); - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); if (ShAmt) { // Perform a logical shr by shiftamt. // Insert the shift to put the result in the low bit. @@ -912,11 +913,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, In = Builder.CreateXor(In, One); } - if (CI.getType() == In->getType()) - return replaceInstUsesWith(CI, In); + if (Zext.getType() == In->getType()) + return replaceInstUsesWith(Zext, In); - Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false); - return replaceInstUsesWith(CI, IntCast); + Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false); + return replaceInstUsesWith(Zext, IntCast); } } } @@ -924,19 +925,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // icmp ne A, B is equal to xor A, B when A and B only really have one bit. // It is also profitable to transform icmp eq into not(xor(A, B)) because that // may lead to additional simplifications.
- if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { - if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); + if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); - KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); - KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; APInt UnknownBit = ~KnownBits; if (UnknownBit.countPopulation() == 1) { - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; Value *Result = Builder.CreateXor(LHS, RHS); @@ -949,10 +950,10 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, Result = Builder.CreateLShr( Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros())); - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (Cmp->getPredicate() == ICmpInst::ICMP_EQ) Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1)); - Result->takeName(ICI); - return replaceInstUsesWith(CI, Result); + Result->takeName(Cmp); + return replaceInstUsesWith(Zext, Result); } } } @@ -1172,8 +1173,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { } } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) - return transformZExtICmp(ICI, CI); + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src)) + return transformZExtICmp(Cmp, CI); BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src); if (SrcI && SrcI->getOpcode() == Instruction::Or) { @@ -1188,7 +1189,9 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName()); Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName()); - BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast); + Value *Or = Builder.CreateOr(LCast, RCast, CI.getName()); + if (auto *OrInst = dyn_cast<Instruction>(Or)) + Builder.SetInsertPoint(OrInst); // Perform the elimination. if (auto *LZExt = dyn_cast<ZExtInst>(LCast)) @@ -1196,7 +1199,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (auto *RZExt = dyn_cast<ZExtInst>(RCast)) transformZExtICmp(RHS, *RZExt); - return Or; + return replaceInstUsesWith(CI, Or); } } @@ -1621,6 +1624,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { Value *X; Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); if (Op && Op->hasOneUse()) { + // FIXME: The FMF should propagate from the fptrunc, not the source op. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (isa<FPMathOperator>(Op)) + Builder.setFastMathFlags(Op->getFastMathFlags()); + if (match(Op, m_FNeg(m_Value(X)))) { Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); @@ -1630,6 +1638,24 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { return BinaryOperator::CreateFNegFMF(InnerTrunc, Op); return UnaryOperator::CreateFNegFMF(InnerTrunc, Op); } + + // If we are truncating a select that has an extended operand, we can + // narrow the other operand and do the select as a narrow op. 
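A sketch of that narrowing (the code follows below; invented name):

    define float @narrow_sel(i1 %c, float %x, double %y) {
      %ext = fpext float %x to double
      %sel = select i1 %c, double %ext, double %y
      %r = fptrunc double %sel to float
      ; becomes: %ny = fptrunc double %y to float
      ;          %r  = select i1 %c, float %x, float %ny
      ret float %r
    }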
+ Value *Cond, *X, *Y; + if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) && + X->getType() == Ty) { + // fptrunc (select Cond, (fpext X), Y) --> select Cond, X, (fptrunc Y) + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } + if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) && + X->getType() == Ty) { + // fptrunc (select Cond, Y, (fpext X)) --> select Cond, (fptrunc Y), X + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } } if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) { @@ -1808,7 +1834,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { Type *Ty = CI.getType(); unsigned AS = CI.getPointerAddressSpace(); - if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS)) + if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS)) return commonPointerCastTransforms(CI); Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS); @@ -1820,12 +1846,24 @@ } /// This input value (which is known to have vector type) is being zero extended -/// or truncated to the specified vector type. +/// or truncated to the specified vector type. Since the zext/trunc is done +/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, and +/// endianness will impact which end of the vector is extended or +/// truncated. +/// +/// A vector is always stored with index 0 at the lowest address, which +/// corresponds to the most significant bits for a big endian stored integer and +/// the least significant bits for little endian. A trunc/zext of an integer +/// impacts the big end of the integer. Thus, we need to add/remove elements at +/// the front of the vector for big endian targets, and the back of the vector +/// for little endian targets. +/// /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, - InstCombiner &IC) { +static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, + VectorType *DestTy, + InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -1844,31 +1882,53 @@ static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InVal = IC.Builder.CreateBitCast(InVal, SrcTy); } + bool IsBigEndian = IC.getDataLayout().isBigEndian(); + unsigned SrcElts = SrcTy->getNumElements(); + unsigned DestElts = DestTy->getNumElements(); + + assert(SrcElts != DestElts && "Element counts should be different."); + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. - SmallVector<uint32_t, 16> ShuffleMask; + SmallVector<uint32_t, 16> ShuffleMaskStorage; + ArrayRef<uint32_t> ShuffleMask; Value *V2; - if (SrcTy->getNumElements() > DestTy->getNumElements()) { - // If we're shrinking the number of elements, just shuffle in the low - // elements from the input and use undef as the second shuffle input.
- V2 = UndefValue::get(SrcTy); - for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) - ShuffleMask.push_back(i); + // Produce an identity shuffle mask for the src vector. + ShuffleMaskStorage.resize(SrcElts); + std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0); + if (SrcElts > DestElts) { + // If we're shrinking the number of elements (rewriting an integer + // truncate), just shuffle in the elements corresponding to the least + // significant bits from the input and use undef as the second shuffle + // input. + V2 = UndefValue::get(SrcTy); + // Make sure the shuffle mask selects the "least significant bits" by + // keeping elements from back of the src vector for big endian, and from the + // front for little endian. + ShuffleMask = ShuffleMaskStorage; + if (IsBigEndian) + ShuffleMask = ShuffleMask.take_back(DestElts); + else + ShuffleMask = ShuffleMask.take_front(DestElts); } else { - // If we're increasing the number of elements, shuffle in all of the - // elements from InVal and fill the rest of the result elements with zeros - // from a constant zero. + // If we're increasing the number of elements (rewriting an integer zext), + // shuffle in all of the elements from InVal. Fill the rest of the result + // elements with zeros from a constant zero. V2 = Constant::getNullValue(SrcTy); - unsigned SrcElts = SrcTy->getNumElements(); - for (unsigned i = 0, e = SrcElts; i != e; ++i) - ShuffleMask.push_back(i); - - // The excess elements reference the first element of the zero input. - for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) - ShuffleMask.push_back(SrcElts); + // Use first elt from V2 when indicating zero in the shuffle mask. + uint32_t NullElt = SrcElts; + // Extend with null values in the "most significant bits" by adding elements + // in front of the src vector for big endian, and at the back for little + // endian. + unsigned DeltaElts = DestElts - SrcElts; + if (IsBigEndian) + ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt); + else + ShuffleMaskStorage.append(DeltaElts, NullElt); + ShuffleMask = ShuffleMaskStorage; } return new ShuffleVectorInst(InVal, V2, @@ -2217,6 +2277,31 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { } } + // Check that each user of each old PHI node is something that we can + // rewrite, so that all of the old PHI nodes can be cleaned up afterwards. + for (auto *OldPN : OldPhiNodes) { + for (User *V : OldPN->users()) { + if (auto *SI = dyn_cast<StoreInst>(V)) { + if (!SI->isSimple() || SI->getOperand(0) != OldPN) + return nullptr; + } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { + // Verify it's a B->A cast. + Type *TyB = BCI->getOperand(0)->getType(); + Type *TyA = BCI->getType(); + if (TyA != DestTy || TyB != SrcTy) + return nullptr; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + // As long as the user is another old PHI node, then even if we don't + // rewrite it, the PHI web we're considering won't have any users + // outside itself, so it'll be dead. + if (OldPhiNodes.count(PHI) == 0) + return nullptr; + } else { + return nullptr; + } + } + } + // For each old PHI node, create a corresponding new PHI node with a type A.
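Stepping back to the optimizeVectorResizeWithIntegerBitCasts change above, a sketch of the zext case; the mask shown is what the little-endian path builds, while big-endian inserts the zero elements at the front instead:

    define <4 x i16> @widen(<2 x i16> %v) {
      %i = bitcast <2 x i16> %v to i32
      %z = zext i32 %i to i64
      %r = bitcast i64 %z to <4 x i16>
      ; little endian becomes:
      ;   shufflevector <2 x i16> %v, <2 x i16> zeroinitializer,
      ;                 <4 x i32> <i32 0, i32 1, i32 2, i32 2>
      ret <4 x i16> %r
    }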
SmallDenseMap<PHINode *, PHINode *> NewPNodes; for (auto *OldPN : OldPhiNodes) { @@ -2234,9 +2319,14 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { if (auto *C = dyn_cast<Constant>(V)) { NewV = ConstantExpr::getBitCast(C, DestTy); } else if (auto *LI = dyn_cast<LoadInst>(V)) { - Builder.SetInsertPoint(LI->getNextNode()); - NewV = Builder.CreateBitCast(LI, DestTy); - Worklist.Add(LI); + // Explicitly perform load combine to make sure no opposing transform + // can remove the bitcast in the meantime and trigger an infinite loop. + Builder.SetInsertPoint(LI); + NewV = combineLoadToNewType(*LI, DestTy); + // Remove the old load and its use in the old phi, which itself becomes + // dead once the whole transform finishes. + replaceInstUsesWith(*LI, UndefValue::get(LI->getType())); + eraseInstFromFunction(*LI); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { NewV = BCI->getOperand(0); } else if (auto *PrevPN = dyn_cast<PHINode>(V)) { @@ -2259,26 +2349,33 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { Instruction *RetVal = nullptr; for (auto *OldPN : OldPhiNodes) { PHINode *NewPN = NewPNodes[OldPN]; - for (User *V : OldPN->users()) { + for (auto It = OldPN->user_begin(), End = OldPN->user_end(); It != End; ) { + User *V = *It; + // We may remove this user, advance to avoid iterator invalidation. + ++It; if (auto *SI = dyn_cast<StoreInst>(V)) { - if (SI->isSimple() && SI->getOperand(0) == OldPN) { - Builder.SetInsertPoint(SI); - auto *NewBC = - cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); - SI->setOperand(0, NewBC); - Worklist.Add(SI); - assert(hasStoreUsersOnly(*NewBC)); - } + assert(SI->isSimple() && SI->getOperand(0) == OldPN); + Builder.SetInsertPoint(SI); + auto *NewBC = + cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); + SI->setOperand(0, NewBC); + Worklist.Add(SI); + assert(hasStoreUsersOnly(*NewBC)); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { - // Verify it's a B->A cast. 
Type *TyB = BCI->getOperand(0)->getType(); Type *TyA = BCI->getType(); - if (TyA == DestTy && TyB == SrcTy) { - Instruction *I = replaceInstUsesWith(*BCI, NewPN); - if (BCI == &CI) - RetVal = I; - } + assert(TyA == DestTy && TyB == SrcTy); + (void) TyA; + (void) TyB; + Instruction *I = replaceInstUsesWith(*BCI, NewPN); + if (BCI == &CI) + RetVal = I; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + assert(OldPhiNodes.count(PHI) > 0); + (void) PHI; + } else { + llvm_unreachable("all uses should be handled"); } } } @@ -2374,8 +2471,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), - cast<VectorType>(DestTy), *this)) + if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts( + BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index a9f64feb600c..f38dc436722d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2566,9 +2566,6 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, Type *Ty = Add->getType(); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (!Add->hasOneUse()) - return nullptr; - // If the add does not wrap, we can always adjust the compare by subtracting // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE // are canonicalized to SGT/SLT/UGT/ULT. @@ -2602,6 +2599,9 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower)); } + if (!Add->hasOneUse()) + return nullptr; + // X+C <u C2 -> (X & -C2) == C // iff C & (C2-1) == 0 // C2 is a power of 2 @@ -3364,6 +3364,23 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, llvm_unreachable("All possible folds are handled."); } + // The mask value may be a vector constant that has undefined elements. But it + // may not be safe to propagate those undefs into the new compare, so replace + // those elements by copying an existing, defined, and safe scalar constant. + Type *OpTy = M->getType(); + auto *VecC = dyn_cast<Constant>(M); + if (OpTy->isVectorTy() && VecC && VecC->containsUndefElement()) { + Constant *SafeReplacementConstant = nullptr; + for (unsigned i = 0, e = OpTy->getVectorNumElements(); i != e; ++i) { + if (!isa<UndefValue>(VecC->getAggregateElement(i))) { + SafeReplacementConstant = VecC->getAggregateElement(i); + break; + } + } + assert(SafeReplacementConstant && "Failed to find undef replacement"); + M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant); + } + return Builder.CreateICmp(DstPred, X, M); } @@ -4930,7 +4947,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { // Get scalar or pointer size. unsigned BitWidth = Ty->isIntOrIntVectorTy() ? Ty->getScalarSizeInBits() - : DL.getIndexTypeSizeInBits(Ty->getScalarType()); + : DL.getPointerTypeSizeInBits(Ty->getScalarType()); if (!BitWidth) return nullptr; @@ -5167,6 +5184,7 @@ llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, return WillIncrement ? 
!C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); }; + Constant *SafeReplacementConstant = nullptr; if (auto *CI = dyn_cast<ConstantInt>(C)) { // Bail out if the constant can't be safely incremented/decremented. if (!ConstantIsOk(CI)) @@ -5186,12 +5204,23 @@ llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, auto *CI = dyn_cast<ConstantInt>(Elt); if (!CI || !ConstantIsOk(CI)) return llvm::None; + + if (!SafeReplacementConstant) + SafeReplacementConstant = CI; } } else { // ConstantExpr? return llvm::None; } + // It may not be safe to change a compare predicate in the presence of + // undefined elements, so replace those elements with the first safe constant + // that we found. + if (C->containsUndefElement()) { + assert(SafeReplacementConstant && "Replacement constant not set"); + C = Constant::replaceUndefsWith(C, SafeReplacementConstant); + } + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); // Increment or decrement the constant. @@ -5374,6 +5403,36 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, return nullptr; } +// extract(uadd.with.overflow(A, B), 0) ult A +// -> extract(uadd.with.overflow(A, B), 1) +static Instruction *foldICmpOfUAddOv(ICmpInst &I) { + CmpInst::Predicate Pred = I.getPredicate(); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + Value *UAddOv; + Value *A, *B; + auto UAddOvResultPat = m_ExtractValue<0>( + m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B))); + if (match(Op0, UAddOvResultPat) && + ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) || + (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) && + (match(A, m_One()) || match(B, m_One()))) || + (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) && + (match(A, m_AllOnes()) || match(B, m_AllOnes()))))) + // extract(uadd.with.overflow(A, B), 0) < A + // extract(uadd.with.overflow(A, 1), 0) == 0 + // extract(uadd.with.overflow(A, -1), 0) != -1 + UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand(); + else if (match(Op1, UAddOvResultPat) && + Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B)) + // A > extract(uadd.with.overflow(A, B), 0) + UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand(); + else + return nullptr; + + return ExtractValueInst::Create(UAddOv, 1); +} + Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; const SimplifyQuery Q = SQ.getWithInstruction(&I); @@ -5562,6 +5621,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpEquality(I)) return Res; + if (Instruction *Res = foldICmpOfUAddOv(I)) + return Res; + // The 'cmpxchg' instruction returns an aggregate containing the old value and // an i1 which indicates whether or not we successfully did the swap. 
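A sketch of the pattern recognized by the foldICmpOfUAddOv helper added above (invented name):

    define i1 @uaddo_ult(i32 %a, i32 %b) {
      %p = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
      %sum = extractvalue { i32, i1 } %p, 0
      %c = icmp ult i32 %sum, %a
      ; becomes: %c = extractvalue { i32, i1 } %p, 1
      ret i1 %c
    }
    declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)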
// diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 1dbc06d92e7a..1a746cb87abb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -369,7 +369,8 @@ public: Instruction *visitFNeg(UnaryOperator &I); Instruction *visitAdd(BinaryOperator &I); Instruction *visitFAdd(BinaryOperator &I); - Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty); + Value *OptimizePointerDifference( + Value *LHS, Value *RHS, Type *Ty, bool isNUW); Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); @@ -446,6 +447,7 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); Instruction *visitVAStartInst(VAStartInst &I); Instruction *visitVACopyInst(VACopyInst &I); + Instruction *visitFreeze(FreezeInst &I); /// Specify what to return for unhandled instructions. Instruction *visitInstruction(Instruction &I) { return nullptr; } @@ -465,6 +467,9 @@ public: /// \return true if successful. bool replacePointer(Instruction &I, Value *V); + LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix = ""); + private: bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; @@ -705,7 +710,7 @@ public: Instruction *eraseInstFromFunction(Instruction &I) { LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); // Make sure that we reprocess all operands now that we reduced their // use counts. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 3a0e05832fcb..ebf9d24eecc4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -449,8 +449,8 @@ static bool isSupportedAtomicType(Type *Ty) { /// /// Note that this will create all of the instructions with whatever insert /// point the \c InstCombiner currently is using. -static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy, - const Twine &Suffix = "") { +LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix) { assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && "can't fold an atomic load to requested type"); @@ -460,10 +460,17 @@ static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewT if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) && NewPtr->getType()->getPointerElementType() == NewTy && NewPtr->getType()->getPointerAddressSpace() == AS)) - NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); + NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); - LoadInst *NewLoad = IC.Builder.CreateAlignedLoad( - NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix); + unsigned Align = LI.getAlignment(); + if (!Align) + // If old load did not have an explicit alignment specified, + // manually preserve the implied (ABI) alignment of the load. + // Else we may inadvertently incorrectly over-promise alignment. 
+ Align = getDataLayout().getABITypeAlignment(LI.getType()); + + LoadInst *NewLoad = Builder.CreateAlignedLoad( + NewTy, NewPtr, Align, LI.isVolatile(), LI.getName() + Suffix); NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); copyMetadataForLoad(*NewLoad, LI); return NewLoad; @@ -526,7 +533,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value /// Returns true if instruction represents a minmax pattern like: /// select ((cmp load V1, load V2), V1, V2). -static bool isMinMaxWithLoads(Value *V) { +static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) { assert(V->getType()->isPointerTy() && "Expected pointer type."); // Ignore possible ty* to ixx* bitcast. V = peekThroughBitcast(V); @@ -540,6 +547,7 @@ if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)), m_Value(LHS), m_Value(RHS)))) return false; + LoadTy = L1->getType(); return (match(L1, m_Load(m_Specific(LHS))) && match(L2, m_Load(m_Specific(RHS)))) || (match(L1, m_Load(m_Specific(RHS))) && @@ -585,20 +593,22 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { // size is a legal integer type. // Do not perform canonicalization if minmax pattern is found (to avoid // infinite loop). + Type *Dummy; if (!Ty->isIntegerTy() && Ty->isSized() && + !(Ty->isVectorTy() && Ty->getVectorIsScalable()) && DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) && !isMinMaxWithLoads( - peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) { + peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true), + Dummy)) { if (all_of(LI.users(), [&LI](User *U) { auto *SI = dyn_cast<StoreInst>(U); return SI && SI->getPointerOperand() != &LI && !SI->getPointerOperand()->isSwiftError(); })) { - LoadInst *NewLoad = combineLoadToNewType( - IC, LI, - Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); + LoadInst *NewLoad = IC.combineLoadToNewType( + LI, Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); // Replace all the stores with stores of the newly loaded value. for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) { auto *SI = cast<StoreInst>(*UI++); @@ -620,7 +630,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { if (auto* CI = dyn_cast<CastInst>(LI.user_back())) if (CI->isNoopCast(DL)) if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy()); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy()); CI->replaceAllUsesWith(NewLoad); IC.eraseInstFromFunction(*CI); return &LI; @@ -648,8 +658,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { // If the struct has only one element, we unpack.
auto NumElements = ST->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), - ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U), + ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -698,7 +708,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { auto *ET = AT->getElementType(); auto NumElements = AT->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -1322,7 +1332,14 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, auto *LI = cast<LoadInst>(SI.getValueOperand()); if (!LI->getType()->isIntegerTy()) return false; - if (!isMinMaxWithLoads(LoadAddr)) + Type *CmpLoadTy; + if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy)) + return false; + + // Make sure we're not changing the size of the load/store. + const auto &DL = IC.getDataLayout(); + if (DL.getTypeStoreSizeInBits(LI->getType()) != + DL.getTypeStoreSizeInBits(CmpLoadTy)) return false; if (!all_of(LI->users(), [LI, LoadAddr](User *U) { @@ -1334,8 +1351,7 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, return false; IC.Builder.SetInsertPoint(LI); - LoadInst *NewLI = combineLoadToNewType( - IC, *LI, LoadAddr->getType()->getPointerElementType()); + LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy); // Replace all the stores with stores of the newly loaded value. for (auto *UI : LI->users()) { auto *USI = cast<StoreInst>(UI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 0b9128a9f5a1..2774e46151fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1239,6 +1239,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); return BinaryOperator::CreateFDivFMF(YZ, X, &I); } + // Z / (1.0 / Y) => (Y * Z) + // + // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The + // m_OneUse check is avoided because even in the case of the multiple uses + // for 1.0/Y, the number of instructions remain the same and a division is + // replaced by a multiplication. + if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(Y, Op0, &I); } if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { @@ -1368,8 +1376,10 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { } // 1 urem X -> zext(X != 1) - if (match(Op0, m_One())) - return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty); + if (match(Op0, m_One())) { + Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1)); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } // X urem C -> X < C ? X : X - C, where C >= signbit. 
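A sketch of the Z / (1.0 / Y) fold added above; it sits in the block guarded by reassociation and reciprocal fast-math flags, so those appear on the divisions (invented name):

    define double @div_by_recip(double %y, double %z) {
      %recip = fdiv reassoc arcp double 1.0, %y
      %q = fdiv reassoc arcp double %z, %recip
      ; becomes: %q = fmul reassoc arcp double %y, %z
      ret double %q
    }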
if (match(Op1, m_Negative())) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index e0376b7582f3..74e015a4f1d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -14,9 +14,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -180,13 +181,14 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { "Not enough available ptr typed incoming values"); PHINode *MatchingPtrPHI = nullptr; unsigned NumPhis = 0; - for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI()); - II != EI; II++, NumPhis++) { + for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) { // FIXME: consider handling this in AggressiveInstCombine + PHINode *PtrPHI = dyn_cast<PHINode>(II); + if (!PtrPHI) + break; if (NumPhis > MaxNumPhis) return nullptr; - PHINode *PtrPHI = dyn_cast<PHINode>(II); - if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) + if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) continue; MatchingPtrPHI = PtrPHI; for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 9fc871e49b30..05a624fde86b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -704,16 +704,24 @@ static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI, assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) && "Unexpected isUnsigned predicate!"); - // Account for swapped form of subtraction: ((a > b) ? b - a : 0). + // Ensure the sub is of the form: + // (a > b) ? a - b : 0 -> usub.sat(a, b) + // (a > b) ? b - a : 0 -> -usub.sat(a, b) + // Checking for both a-b and a+(-b) as a constant. bool IsNegative = false; - if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A)))) + const APInt *C; + if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) || + (match(A, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C))))) IsNegative = true; - else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B)))) + else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) && + !(match(B, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C))))) return nullptr; - // If sub is used anywhere else, we wouldn't be able to eliminate it - // afterwards. - if (!TrueVal->hasOneUse()) + // If we are adding a negate and the sub and icmp are used anywhere else, we + // would end up with more instructions. + if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse()) return nullptr; // (a > b) ? a - b : 0 -> usub.sat(a, b) @@ -781,6 +789,13 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, return Builder.CreateBinaryIntrinsic( Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1)); } + // The overflow may be detected via the add wrapping round. 
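A sketch of the wrap-around form this comment refers to (invented name):

    define i32 @sat_add_wrap(i32 %x, i32 %y) {
      %a = add i32 %x, %y
      %wrap = icmp ult i32 %a, %x
      %r = select i1 %wrap, i32 -1, i32 %a
      ; becomes: %r = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)
      ret i32 %r
    }
    declare i32 @llvm.uadd.sat.i32(i32, i32)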
+ if (match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) && + match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) { + // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y) + // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y) + return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y); + } return nullptr; } @@ -1725,6 +1740,128 @@ static Instruction *foldAddSubSelect(SelectInst &SI, return nullptr; } +/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y +/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y +/// Along with a number of patterns similar to: +/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y +/// X - Y overflows ? (X > 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y +static Instruction * +foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { + Value *CondVal = SI.getCondition(); + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + + WithOverflowInst *II; + if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) || + !match(FalseVal, m_ExtractValue<0>(m_Specific(II)))) + return nullptr; + + Value *X = II->getLHS(); + Value *Y = II->getRHS(); + + auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) { + Type *Ty = Limit->getType(); + + ICmpInst::Predicate Pred; + Value *TrueVal, *FalseVal, *Op; + const APInt *C; + if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)), + m_Value(TrueVal), m_Value(FalseVal)))) + return false; + + auto IsZeroOrOne = [](const APInt &C) { + return C.isNullValue() || C.isOneValue(); + }; + auto IsMinMax = [&](Value *Min, Value *Max) { + APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits()); + APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits()); + return match(Min, m_SpecificInt(MinVal)) && + match(Max, m_SpecificInt(MaxVal)); + }; + + if (Op != X && Op != Y) + return false; + + if (IsAdd) { + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(FalseVal, TrueVal)) + return true; + } else { + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? 
(Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + } + + return false; + }; + + Intrinsic::ID NewIntrinsicID; + if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow && + match(TrueVal, m_AllOnes())) + // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y + NewIntrinsicID = Intrinsic::uadd_sat; + else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow && + match(TrueVal, m_Zero())) + // X - Y overflows ? 0 : X - Y -> usub_sat X, Y + NewIntrinsicID = Intrinsic::usub_sat; + else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true)) + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + NewIntrinsicID = Intrinsic::sadd_sat; + else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false)) + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + NewIntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + Function *F = + Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + return CallInst::Create(F, {X, Y}); +} + Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { Constant *C; if (!match(Sel.getTrueValue(), m_Constant(C)) && @@ -2296,7 +2433,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // See if we are selecting two values based on a comparison of the two values. if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) { - if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) { + Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1); + if ((Cmp0 == TrueVal && Cmp1 == FalseVal) || + (Cmp0 == FalseVal && Cmp1 == TrueVal)) { // Canonicalize to use ordered comparisons by swapping the select // operands. // @@ -2305,30 +2444,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); IRBuilder<>::FastMathFlagGuard FMFG(Builder); + // FIXME: The FMF should propagate from the select, not the fcmp. 
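The simplest case handled by the foldOverflowingAddSubSelect hunk above, sketched with invented names:

    define i32 @sat_add_overflow(i32 %x, i32 %y) {
      %wo = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
      %sum = extractvalue { i32, i1 } %wo, 0
      %ovf = extractvalue { i32, i1 } %wo, 1
      %r = select i1 %ovf, i32 -1, i32 %sum
      ; becomes: %r = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)
      ret i32 %r
    }
    declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)
    declare i32 @llvm.uadd.sat.i32(i32, i32)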
         Builder.setFastMathFlags(FCI->getFastMathFlags());
-        Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal,
-                                            FCI->getName() + ".inv");
-
-        return SelectInst::Create(NewCond, FalseVal, TrueVal,
-                                  SI.getName() + ".p");
-      }
-
-      // NOTE: if we wanted to, this is where to detect MIN/MAX
-    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
-      // Canonicalize to use ordered comparisons by swapping the select
-      // operands.
-      //
-      // e.g.
-      // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y
-      if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
-        FCmpInst::Predicate InvPred = FCI->getInversePredicate();
-        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
-        Builder.setFastMathFlags(FCI->getFastMathFlags());
-        Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal,
+        Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
                                             FCI->getName() + ".inv");
-
-        return SelectInst::Create(NewCond, FalseVal, TrueVal,
-                                  SI.getName() + ".p");
+        Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
+        return replaceInstUsesWith(SI, NewSel);
       }
 
       // NOTE: if we wanted to, this is where to detect MIN/MAX
@@ -2391,6 +2512,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
   if (Instruction *Add = foldAddSubSelect(SI, Builder))
     return Add;
+  if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
+    return Add;
 
   // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
   auto *TI = dyn_cast<Instruction>(TrueVal);
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 64294838644f..fbff5dd4a8cd 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -138,24 +138,6 @@ Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts(
   return Ret;
 }
 
-// Try to replace `undef` constants in C with Replacement.
-static Constant *replaceUndefsWith(Constant *C, Constant *Replacement) {
-  if (C && match(C, m_Undef()))
-    return Replacement;
-
-  if (auto *CV = dyn_cast<ConstantVector>(C)) {
-    llvm::SmallVector<Constant *, 32> NewOps(CV->getNumOperands());
-    for (unsigned i = 0, NumElts = NewOps.size(); i != NumElts; ++i) {
-      Constant *EltC = CV->getOperand(i);
-      NewOps[i] = EltC && match(EltC, m_Undef()) ? Replacement : EltC;
-    }
-    return ConstantVector::get(NewOps);
-  }
-
-  // Don't know how to deal with this constant.
-  return C;
-}
-
 // If we have some pattern that leaves only some low bits set and then performs
 // a left-shift of those bits, and if none of the bits that remain after the
 // final shift are modified by the mask, we can omit the mask.
@@ -180,10 +162,20 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
          "The input must be 'shl'!");
 
   Value *Masked, *ShiftShAmt;
-  match(OuterShift, m_Shift(m_Value(Masked), m_Value(ShiftShAmt)));
+  match(OuterShift,
+        m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt))));
+
+  // *If* there is a truncation between an outer shift and a possible mask,
+  // then said truncation *must* be one-use, else we can't perform the fold.
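+  // For illustration, one shape this guards (a sketch, with an i64 source
+  // narrowed to i32):
+  //   %masked = and i64 %x, %mask
+  //   %t = trunc i64 %masked to i32   ; %t must have no other users
+  //   %r = shl i32 %t, %amt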
+  Value *Trunc;
+  if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) &&
+      !Trunc->hasOneUse())
+    return nullptr;
 
   Type *NarrowestTy = OuterShift->getType();
   Type *WidestTy = Masked->getType();
+  bool HadTrunc = WidestTy != NarrowestTy;
+
   // The mask must be computed in a type twice as wide to ensure
   // that no bits are lost if the sum-of-shifts is wider than the base type.
   Type *ExtendedTy = WidestTy->getExtendedType();
@@ -204,6 +196,14 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
   Constant *NewMask;
 
   if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
+    // Peek through an optional zext of the shift amount.
+    match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
+    // We have two shift amounts from two different shifts. The types of those
+    // shift amounts may not match. If that's the case let's bail out now.
+    if (MaskShAmt->getType() != ShiftShAmt->getType())
+      return nullptr;
+
     // Can we simplify (MaskShAmt+ShiftShAmt) ?
     auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
         MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
@@ -216,7 +216,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
     // completely unknown. Replace the `undef` shift amounts with final
     // shift bitwidth to ensure that the value remains undef when creating the
     // subsequent shift op.
-    SumOfShAmts = replaceUndefsWith(
+    SumOfShAmts = Constant::replaceUndefsWith(
         SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
                                       ExtendedTy->getScalarSizeInBits()));
     auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
@@ -228,6 +228,14 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
   } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
              match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
                                  m_Deferred(MaskShAmt)))) {
+    // Peek through an optional zext of the shift amount.
+    match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
+    // We have two shift amounts from two different shifts. The types of those
+    // shift amounts may not match. If that's the case let's bail out now.
+    if (MaskShAmt->getType() != ShiftShAmt->getType())
+      return nullptr;
+
     // Can we simplify (ShiftShAmt-MaskShAmt) ?
     auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
         ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
@@ -241,7 +249,7 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
     // bitwidth of innermost shift to ensure that the value remains undef when
     // creating the subsequent shift op.
     unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
-    ShAmtsDiff = replaceUndefsWith(
+    ShAmtsDiff = Constant::replaceUndefsWith(
         ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(),
                                      -WidestTyBitWidth));
     auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt(
@@ -272,10 +280,15 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
       return nullptr;
   }
 
+  // If we need to apply truncation, let's do it first, since we can.
+  // We have already ensured that the old truncation will go away.
+  if (HadTrunc)
+    X = Builder.CreateTrunc(X, NarrowestTy);
+
   // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits.
+  // We didn't change the Type of this outermost shift, so we can just do it.
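+  // e.g. when the mask only clears bits that the shift discards anyway:
+  //   %m = and i32 %x, 268435455 ; 0x0FFFFFFF, keep the low 28 bits
+  //   %r = shl i32 %m, 4
+  // --> %r = shl i32 %x, 4       ; the 4 masked-off high bits are shifted out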
   auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X,
                                           OuterShift->getOperand(1));
-
   if (!NeedMask)
     return NewShift;
@@ -283,6 +296,50 @@ dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
   return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
 }
 
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates
+/// a use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+                                            InstCombiner::BuilderTy &Builder) {
+  assert(I.isShift() && "Expected a shift as input");
+  auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+  if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+    return nullptr;
+
+  const APInt *C0, *C1;
+  if (!match(I.getOperand(1), m_APInt(C1)))
+    return nullptr;
+
+  Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+  Type *Ty = I.getType();
+
+  // Find a matching one-use shift by constant. The fold is not valid if the sum
+  // of the shift values equals or exceeds bitwidth.
+  // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+  Value *X, *Y;
+  auto matchFirstShift = [&](Value *V) {
+    return !isa<ConstantExpr>(V) &&
+           match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) &&
+           cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode &&
+           (*C0 + *C1).ult(Ty->getScalarSizeInBits());
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  if (matchFirstShift(LogicInst->getOperand(0)))
+    Y = LogicInst->getOperand(1);
+  else if (matchFirstShift(LogicInst->getOperand(1)))
+    Y = LogicInst->getOperand(0);
+  else
+    return nullptr;
+
+  // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+  Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1);
+  Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+  Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+  return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
+
 Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
   assert(Op0->getType() == Op1->getType());
@@ -335,6 +392,9 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
     return &I;
   }
 
+  if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+    return Logic;
+
   return nullptr;
 }
 
diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index d30ab8001897..47ce83974c8d 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -14,6 +14,8 @@
 #include "InstCombineInternal.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/KnownBits.h"
 
@@ -348,8 +350,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
 
     // If the operands are constants, see if we can simplify them.
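    // e.g. for (X s< 255) ? X : 255, an smin idiom, the code below prefers
    // keeping the select constant equal to the icmp constant 255 over
    // shrinking it under the demanded mask.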
-    if (ShrinkDemandedConstant(I, 1, DemandedMask) ||
-        ShrinkDemandedConstant(I, 2, DemandedMask))
+    // This is similar to ShrinkDemandedConstant, but for a select we want to
+    // try to keep the selected constants the same as icmp value constants, if
+    // we can. This helps avoid breaking apart (or helps put back together)
+    // canonical patterns like min and max.
+    auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
+                                         APInt DemandedMask) {
+      const APInt *SelC;
+      if (!match(I->getOperand(OpNo), m_APInt(SelC)))
+        return false;
+
+      // Get the constant out of the ICmp, if there is one.
+      const APInt *CmpC;
+      ICmpInst::Predicate Pred;
+      if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) ||
+          CmpC->getBitWidth() != SelC->getBitWidth())
+        return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+
+      // If the constant is already the same as the ICmp, leave it as-is.
+      if (*CmpC == *SelC)
+        return false;
+      // If the constants are not already the same, but can be made the same
+      // under the demanded mask, use the constant value from the ICmp.
+      if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
+        I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
+        return true;
+      }
+      return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+    };
+    if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
+        CanonicalizeSelectConstant(I, 2, DemandedMask))
       return I;
 
     // Only known if known in both the LHS and RHS.
@@ -1247,30 +1277,57 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     break;
   }
   case Instruction::ShuffleVector: {
-    ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
-    unsigned LHSVWidth =
-      Shuffle->getOperand(0)->getType()->getVectorNumElements();
-    APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0);
+    auto *Shuffle = cast<ShuffleVectorInst>(I);
+    assert(Shuffle->getOperand(0)->getType() ==
+           Shuffle->getOperand(1)->getType() &&
+           "Expected shuffle operands to have same type");
+    unsigned OpWidth =
+        Shuffle->getOperand(0)->getType()->getVectorNumElements();
+    APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
     for (unsigned i = 0; i < VWidth; i++) {
       if (DemandedElts[i]) {
         unsigned MaskVal = Shuffle->getMaskValue(i);
         if (MaskVal != -1u) {
-          assert(MaskVal < LHSVWidth * 2 &&
+          assert(MaskVal < OpWidth * 2 &&
                  "shufflevector mask index out of range!");
-          if (MaskVal < LHSVWidth)
+          if (MaskVal < OpWidth)
             LeftDemanded.setBit(MaskVal);
           else
-            RightDemanded.setBit(MaskVal - LHSVWidth);
+            RightDemanded.setBit(MaskVal - OpWidth);
         }
       }
     }
 
-    APInt LHSUndefElts(LHSVWidth, 0);
+    APInt LHSUndefElts(OpWidth, 0);
     simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
 
-    APInt RHSUndefElts(LHSVWidth, 0);
+    APInt RHSUndefElts(OpWidth, 0);
     simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
 
+    // If this shuffle does not change the vector length and the elements
+    // demanded by this shuffle are an identity mask, then this shuffle is
+    // unnecessary.
+    //
+    // We are assuming canonical form for the mask, so the source vector is
+    // operand 0 and operand 1 is not used.
+    //
+    // Note that if an element is demanded and this shuffle mask is undefined
+    // for that element, then the shuffle is not considered an identity
+    // operation. The shuffle prevents poison from the operand vector from
+    // leaking to the result by replacing poison with an undefined value.
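+    // e.g. if only element 0 is demanded, then
+    //   shufflevector <4 x i32> %v, <4 x i32> undef,
+    //                 <4 x i32> <i32 0, i32 3, i32 2, i32 3>
+    // acts as an identity on the demanded lane and can be replaced by %v.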
+ if (VWidth == OpWidth) { + bool IsIdentityShuffle = true; + for (unsigned i = 0; i < VWidth; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (DemandedElts[i] && i != MaskVal) { + IsIdentityShuffle = false; + break; + } + } + if (IsIdentityShuffle) + return Shuffle->getOperand(0); + } + bool NewUndefElts = false; unsigned LHSIdx = -1u, LHSValIdx = -1u; unsigned RHSIdx = -1u, RHSValIdx = -1u; @@ -1283,23 +1340,23 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } else if (!DemandedElts[i]) { NewUndefElts = true; UndefElts.setBit(i); - } else if (MaskVal < LHSVWidth) { + } else if (MaskVal < OpWidth) { if (LHSUndefElts[MaskVal]) { NewUndefElts = true; UndefElts.setBit(i); } else { - LHSIdx = LHSIdx == -1u ? i : LHSVWidth; - LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth; + LHSIdx = LHSIdx == -1u ? i : OpWidth; + LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth; LHSUniform = LHSUniform && (MaskVal == i); } } else { - if (RHSUndefElts[MaskVal - LHSVWidth]) { + if (RHSUndefElts[MaskVal - OpWidth]) { NewUndefElts = true; UndefElts.setBit(i); } else { - RHSIdx = RHSIdx == -1u ? i : LHSVWidth; - RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; - RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); + RHSIdx = RHSIdx == -1u ? i : OpWidth; + RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth; + RHSUniform = RHSUniform && (MaskVal - OpWidth == i); } } } @@ -1308,20 +1365,20 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // this constant vector to single insertelement instruction. // shufflevector V, C, <v1, v2, .., ci, .., vm> -> // insertelement V, C[ci], ci-n - if (LHSVWidth == Shuffle->getType()->getNumElements()) { + if (OpWidth == Shuffle->getType()->getNumElements()) { Value *Op = nullptr; Constant *Value = nullptr; unsigned Idx = -1u; // Find constant vector with the single element in shuffle (LHS or RHS). - if (LHSIdx < LHSVWidth && RHSUniform) { + if (LHSIdx < OpWidth && RHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { Op = Shuffle->getOperand(1); Value = CV->getOperand(LHSValIdx); Idx = LHSIdx; } } - if (RHSIdx < LHSVWidth && LHSUniform) { + if (RHSIdx < OpWidth && LHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { Op = Shuffle->getOperand(0); Value = CV->getOperand(RHSValIdx); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 9c890748e5ab..f604c9dc32ca 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -1390,20 +1390,6 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, - bool &isLHSID, bool &isRHSID) { - isLHSID = isRHSID = true; - - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) continue; // Ignore undef values. - // Is this an identity shuffle of the LHS value? - isLHSID &= (Mask[i] == (int)i); - - // Is this an identity shuffle of the RHS value? 
-      isRHSID &= (Mask[i]-e == i);
-  }
-}
-
 // Returns true if the shuffle is extracting a contiguous range of values from
 // LHS, for example:
 //                 +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@@ -1560,9 +1546,11 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
   if (!Shuf.isSelect())
     return nullptr;
 
-  // Canonicalize to choose from operand 0 first.
+  // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
+  // Commuting undef to operand 0 conflicts with another canonicalization.
   unsigned NumElts = Shuf.getType()->getVectorNumElements();
-  if (Shuf.getMaskValue(0) >= (int)NumElts) {
+  if (!isa<UndefValue>(Shuf.getOperand(1)) &&
+      Shuf.getMaskValue(0) >= (int)NumElts) {
     // TODO: Can we assert that both operands of a shuffle-select are not undef
     // (otherwise, it would have been folded by instsimplify)?
     Shuf.commute();
@@ -1753,7 +1741,8 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
   return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
 }
 
-/// Try to replace a shuffle with an insertelement.
+/// Try to replace a shuffle with an insertelement or try to replace a shuffle
+/// operand with the operand of an insertelement.
 static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
   Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
   SmallVector<int, 16> Mask = Shuf.getShuffleMask();
@@ -1765,6 +1754,31 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
   if (NumElts != (int)(V0->getType()->getVectorNumElements()))
     return nullptr;
 
+  // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
+  // not be able to handle it there if the insertelement has >1 use.
+  // If the shuffle has an insertelement operand but does not choose the
+  // inserted scalar element from that value, then we can replace that shuffle
+  // operand with the source vector of the insertelement.
+  Value *X;
+  uint64_t IdxC;
+  if (match(V0, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+    // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
+    if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) {
+      Shuf.setOperand(0, X);
+      return &Shuf;
+    }
+  }
+  if (match(V1, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+    // Offset the index constant by the vector width because we are checking for
+    // accesses to the 2nd vector input of the shuffle.
+    IdxC += NumElts;
+    // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
+    if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) {
+      Shuf.setOperand(1, X);
+      return &Shuf;
+    }
+  }
+
   // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
   auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
     // We need an insertelement with a constant index.
@@ -1891,29 +1905,21 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
           LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
     return replaceInstUsesWith(SVI, V);
 
-  // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask')
-  // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+ // shuffle x, x, mask --> shuffle x, undef, mask' unsigned VWidth = SVI.getType()->getVectorNumElements(); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); SmallVector<int, 16> Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); - if (LHS == RHS || isa<UndefValue>(LHS)) { + if (LHS == RHS) { + assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?"); // Remap any references to RHS to use LHS. SmallVector<Constant*, 16> Elts; - for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { - if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Int32Ty)); - continue; - } - - if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || - (Mask[i] < (int)e && isa<UndefValue>(LHS))) { - Mask[i] = -1; // Turn into undef. + for (unsigned i = 0; i != VWidth; ++i) { + // Propagate undef elements or force mask to LHS. + if (Mask[i] < 0) Elts.push_back(UndefValue::get(Int32Ty)); - } else { - Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); - } + else + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i] % LHSWidth)); } SVI.setOperand(0, SVI.getOperand(1)); SVI.setOperand(1, UndefValue::get(RHS->getType())); @@ -1921,6 +1927,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } + // shuffle undef, x, mask --> shuffle x, undef, mask' + if (isa<UndefValue>(LHS)) { + SVI.commute(); + return &SVI; + } + if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; @@ -1948,16 +1960,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = foldIdentityPaddedShuffles(SVI)) return I; - if (VWidth == LHSWidth) { - // Analyze the shuffle, are the LHS or RHS and identity shuffles? - bool isLHSID, isRHSID; - recognizeIdentityMask(Mask, isLHSID, isRHSID); - - // Eliminate identity shuffles. - if (isLHSID) return replaceInstUsesWith(SVI, LHS); - if (isRHSID) return replaceInstUsesWith(SVI, RHS); - } - if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) { Value *V = evaluateInDifferentElementOrder(LHS, Mask); return replaceInstUsesWith(SVI, V); @@ -2235,12 +2237,5 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } - // If the result mask is an identity, replace uses of this instruction with - // corresponding argument. - bool isLHSID, isRHSID; - recognizeIdentityMask(newMask, isLHSID, isRHSID); - if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS); - if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS); - return MadeChange ? 
&SVI : nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index ecb486c544e0..801c09a317a7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -86,6 +86,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" @@ -121,6 +122,9 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); +static constexpr unsigned InstCombineDefaultMaxIterations = 1000; +static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000; + static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); @@ -129,6 +133,17 @@ static cl::opt<bool> EnableExpensiveCombines("expensive-combines", cl::desc("Enable expensive instruction combines")); +static cl::opt<unsigned> LimitMaxIterations( + "instcombine-max-iterations", + cl::desc("Limit the maximum number of instruction combining iterations"), + cl::init(InstCombineDefaultMaxIterations)); + +static cl::opt<unsigned> InfiniteLoopDetectionThreshold( + "instcombine-infinite-loop-threshold", + cl::desc("Number of instruction combining iterations considered an " + "infinite loop"), + cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden); + static cl::opt<unsigned> MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); @@ -759,35 +774,52 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, Value *RHS) { - Instruction::BinaryOps Opcode = I.getOpcode(); - // (op (select (a, b, c)), (select (a, d, e))) -> (select (a, (op b, d), (op - // c, e))) - Value *A, *B, *C, *D, *E; - Value *SI = nullptr; - if (match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))) && - match(RHS, m_Select(m_Specific(A), m_Value(D), m_Value(E)))) { - bool SelectsHaveOneUse = LHS->hasOneUse() && RHS->hasOneUse(); - - FastMathFlags FMF; - BuilderTy::FastMathFlagGuard Guard(Builder); - if (isa<FPMathOperator>(&I)) { - FMF = I.getFastMathFlags(); - Builder.setFastMathFlags(FMF); - } + Value *A, *B, *C, *D, *E, *F; + bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))); + bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F))); + if (!LHSIsSelect && !RHSIsSelect) + return nullptr; - Value *V1 = SimplifyBinOp(Opcode, C, E, FMF, SQ.getWithInstruction(&I)); - Value *V2 = SimplifyBinOp(Opcode, B, D, FMF, SQ.getWithInstruction(&I)); - if (V1 && V2) - SI = Builder.CreateSelect(A, V2, V1); - else if (V2 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, V2, Builder.CreateBinOp(Opcode, C, E)); - else if (V1 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, Builder.CreateBinOp(Opcode, B, D), V1); + FastMathFlags FMF; + BuilderTy::FastMathFlagGuard Guard(Builder); + if (isa<FPMathOperator>(&I)) { + FMF = I.getFastMathFlags(); + Builder.setFastMathFlags(FMF); + } - if (SI) - SI->takeName(&I); + Instruction::BinaryOps Opcode = I.getOpcode(); + SimplifyQuery Q = SQ.getWithInstruction(&I); + + Value *Cond, *True = nullptr, 
*False = nullptr; + if (LHSIsSelect && RHSIsSelect && A == D) { + // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F) + Cond = A; + True = SimplifyBinOp(Opcode, B, E, FMF, Q); + False = SimplifyBinOp(Opcode, C, F, FMF, Q); + + if (LHS->hasOneUse() && RHS->hasOneUse()) { + if (False && !True) + True = Builder.CreateBinOp(Opcode, B, E); + else if (True && !False) + False = Builder.CreateBinOp(Opcode, C, F); + } + } else if (LHSIsSelect && LHS->hasOneUse()) { + // (A ? B : C) op Y -> A ? (B op Y) : (C op Y) + Cond = A; + True = SimplifyBinOp(Opcode, B, RHS, FMF, Q); + False = SimplifyBinOp(Opcode, C, RHS, FMF, Q); + } else if (RHSIsSelect && RHS->hasOneUse()) { + // X op (D ? E : F) -> D ? (X op E) : (X op F) + Cond = D; + True = SimplifyBinOp(Opcode, LHS, E, FMF, Q); + False = SimplifyBinOp(Opcode, LHS, F, FMF, Q); } + if (!True || !False) + return nullptr; + + Value *SI = Builder.CreateSelect(Cond, True, False); + SI->takeName(&I); return SI; } @@ -1526,11 +1558,13 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { // If this is a widening shuffle, we must be able to extend with undef // elements. If the original binop does not produce an undef in the high // lanes, then this transform is not safe. + // Similarly for undef lanes due to the shuffle mask, we can only + // transform binops that preserve undef. // TODO: We could shuffle those non-undef constant values into the // result by using a constant vector (rather than an undef vector) // as operand 1 of the new binop, but that might be too aggressive // for target-independent shuffle creation. - if (I >= SrcVecNumElts) { + if (I >= SrcVecNumElts || ShMask[I] < 0) { Constant *MaybeUndef = ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) : ConstantExpr::get(Opcode, CElt, UndefScalar); @@ -1615,6 +1649,15 @@ Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) { return CastInst::Create(CastOpc, NarrowBO, BO.getType()); } +static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) { + // At least one GEP must be inbounds. + if (!GEP1.isInBounds() && !GEP2.isInBounds()) + return false; + + return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) && + (GEP2.isInBounds() || GEP2.hasAllZeroIndices()); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); Type *GEPType = GEP.getType(); @@ -1724,8 +1767,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // The first two arguments can vary for any GEP, the rest have to be // static for struct slots - if (J > 1 && CurTy->isStructTy()) - return nullptr; + if (J > 1) { + assert(CurTy && "No current type?"); + if (CurTy->isStructTy()) + return nullptr; + } DI = J; } else { @@ -1885,6 +1931,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Update the GEP in place if possible. if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); GEP.setOperand(0, Src->getOperand(0)); GEP.setOperand(1, Sum); return &GEP; @@ -1901,7 +1948,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } if (!Indices.empty()) - return GEP.isInBounds() && Src->isInBounds() + return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) ? 
GetElementPtrInst::CreateInBounds( Src->getSourceElementType(), Src->getOperand(0), Indices, GEP.getName()) @@ -2154,15 +2201,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // of a bitcasted pointer to vector or array of the same dimensions: // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z - auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) { + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, + const DataLayout &DL) { return ArrTy->getArrayElementType() == VecTy->getVectorElementType() && - ArrTy->getArrayNumElements() == VecTy->getVectorNumElements(); + ArrTy->getArrayNumElements() == VecTy->getVectorNumElements() && + DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); }; if (GEP.getNumOperands() == 3 && ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() && - areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) || + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() && - areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) { + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { // Create a new GEP here, as using `setOperand()` followed by // `setSourceElementType()` won't actually update the type of the @@ -2401,12 +2450,13 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) || - isa<AddrSpaceCastInst>(I)) { - replaceInstUsesWith(*I, UndefValue::get(I->getType())); } else if (auto *SI = dyn_cast<StoreInst>(I)) { for (auto *DII : DIIs) ConvertDebugDeclareToDebugValue(DII, SI, *DIB); + } else { + // Casts, GEP, or anything else: we're about to delete this instruction, + // so it can not have any valid uses. + replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); } @@ -3111,6 +3161,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } +Instruction *InstCombiner::visitFreeze(FreezeInst &I) { + Value *Op0 = I.getOperand(0); + + if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + return nullptr; +} + /// Try to move the specified instruction from its current block into the /// beginning of DestBlock, which can only happen if it's safe to move the /// instruction past all of the instructions between it and the end of its @@ -3322,10 +3381,6 @@ bool InstCombiner::run() { // Move the name to the new instruction first. Result->takeName(I); - // Push the new instruction and any users onto the worklist. - Worklist.AddUsersToWorkList(*Result); - Worklist.Add(Result); - // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); BasicBlock::iterator InsertPos = I->getIterator(); @@ -3337,6 +3392,10 @@ bool InstCombiner::run() { InstParent->getInstList().insert(InsertPos, Result); + // Push the new instruction and any users onto the worklist. 
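+      // Adding Result only after it has been inserted ensures the worklist
+      // never holds an instruction that is detached from the function.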
+ Worklist.AddUsersToWorkList(*Result); + Worklist.Add(Result); + eraseInstFromFunction(*I); } else { LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' @@ -3392,8 +3451,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + salvageDebugInfoOrMarkUndef(*Inst); Inst->eraseFromParent(); MadeIRChange = true; continue; @@ -3507,10 +3565,11 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, - LoopInfo *LI = nullptr) { + ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations, + LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); ExpensiveCombines |= EnableExpensiveCombines; + MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. @@ -3529,9 +3588,23 @@ static bool combineInstructionsOverFunction( MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. - int Iteration = 0; + unsigned Iteration = 0; while (true) { ++Iteration; + + if (Iteration > InfiniteLoopDetectionThreshold) { + report_fatal_error( + "Instruction Combining seems stuck in an infinite loop after " + + Twine(InfiniteLoopDetectionThreshold) + " iterations."); + } + + if (Iteration > MaxIterations) { + LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations + << " on " << F.getName() + << " reached; stopping before reaching a fixpoint\n"); + break; + } + LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -3543,11 +3616,19 @@ static bool combineInstructionsOverFunction( if (!IC.run()) break; + + MadeIRChange = true; } - return MadeIRChange || Iteration > 1; + return MadeIRChange; } +InstCombinePass::InstCombinePass(bool ExpensiveCombines) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {} + +InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {} + PreservedAnalyses InstCombinePass::run(Function &F, FunctionAnalysisManager &AM) { auto &AC = AM.getResult<AssumptionAnalysis>(F); @@ -3565,8 +3646,9 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr; - if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI)) // No changes, all analyses are preserved. 
return PreservedAnalyses::all(); @@ -3615,12 +3697,26 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI); } char InstructionCombiningPass::ID = 0; +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(InstCombineDefaultMaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(MaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -3647,6 +3743,11 @@ FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { return new InstructionCombiningPass(ExpensiveCombines); } +FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) { + return new InstructionCombiningPass(ExpensiveCombines, MaxIterations); +} + void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createInstructionCombiningPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index d92ee11c2e1a..79c119489a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -1808,6 +1809,8 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { if (GlobalsMD.get(G).IsBlacklisted) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; + // Only instrument globals of default address spaces + if (G->getAddressSpace()) return false; if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals. // Two problems with thread-locals: // - The address of the main thread's copy can't be computed at link-time. @@ -2898,15 +2901,14 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { for (Argument &Arg : F.args()) { if (Arg.hasByValAttr()) { Type *Ty = Arg.getType()->getPointerElementType(); - unsigned Alignment = Arg.getParamAlignment(); - if (Alignment == 0) - Alignment = DL.getABITypeAlignment(Ty); + const Align Alignment = + DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty); AllocaInst *AI = IRB.CreateAlloca( Ty, nullptr, (Arg.hasName() ? 
Arg.getName() : "Arg" + Twine(Arg.getArgNo())) + ".byval"); - AI->setAlignment(Align(Alignment)); + AI->setAlignment(Alignment); Arg.replaceAllUsesWith(AI); uint64_t AllocSize = DL.getTypeAllocSize(Ty); @@ -2993,7 +2995,6 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); // Make sure non-instrumented allocas stay in the entry block. Otherwise, // debug info is broken, because only entry-block allocas are treated as @@ -3088,14 +3089,12 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *Term = SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false); IRBuilder<> IRBIf(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); StackMallocIdx = StackMallocSizeClass(LocalStackSize); assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); Value *FakeStackValue = IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term, ConstantInt::get(IntptrTy, 0)); @@ -3103,14 +3102,11 @@ void FunctionStackPoisoner::processStaticAllocas() { IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); IRBIf.SetInsertPoint(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); Value *AllocaValue = DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); - IRB.SetCurrentDebugLocation(EntryDebugLocation); IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca); DIExprFlags |= DIExpression::DerefBefore; } else { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index ae34be986537..9abb62ac788c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 55c64fa4b727..d35abb92dd08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -27,7 +27,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -623,6 +625,10 @@ static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb, assert(SumWt >= TrueWt && SumWt >= FalseWt && "Overflow calculating branch probabilities."); + // Guard against 0-to-0 branch weights to avoid a division-by-zero crash. 
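+  // e.g. a degenerate !prof !{!"branch_weights", i32 0, i32 0} annotation
+  // would otherwise reach the division below with SumWt == 0.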
+ if (SumWt == 0) + return false; + TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt); FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt); return true; @@ -1061,6 +1067,7 @@ static bool shouldSplit(Instruction *InsertPoint, DenseSet<Value *> &ConditionValues, DominatorTree &DT, DenseSet<Instruction *> &Unhoistables) { + assert(InsertPoint && "Null InsertPoint"); CHR_DEBUG( dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues "; for (Value *V : PrevConditionValues) { @@ -1071,7 +1078,6 @@ static bool shouldSplit(Instruction *InsertPoint, dbgs() << *V << ", "; } dbgs() << "\n"); - assert(InsertPoint && "Null InsertPoint"); // If any of Bases isn't hoistable to the hoist point, split. for (Value *V : ConditionValues) { DenseMap<Instruction *, bool> Visited; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index c0353cba0b2f..cf9a6a321c7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -55,7 +55,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -83,13 +82,16 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SpecialCaseList.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -479,7 +481,9 @@ DataFlowSanitizer::DataFlowSanitizer( std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); - ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); + // FIXME: should we propagate vfs::FileSystem to this constructor? 
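+  // vfs::getRealFileSystem() returns the process-wide real file system, so
+  // the ABI lists are still read from disk as before.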
+ ABIList.set( + SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem())); } FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index ac6082441eae..bf3e4ed3e31f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -128,9 +129,9 @@ private: // Checksum, produced by hash of EdgeDestinations SmallVector<uint32_t, 4> FileChecksums; - Module *M; + Module *M = nullptr; std::function<const TargetLibraryInfo &(Function &F)> GetTLI; - LLVMContext *Ctx; + LLVMContext *Ctx = nullptr; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; std::vector<Regex> FilterRe; std::vector<Regex> ExcludeRe; @@ -384,7 +385,7 @@ namespace { return EdgeDestinations; } - uint32_t getFuncChecksum() { + uint32_t getFuncChecksum() const { return FuncChecksum; } @@ -713,7 +714,10 @@ void GCOVProfiler::emitProfileNotes() { // to have a counter for the function definition. uint32_t Line = SP->getLine(); auto Filename = getFilename(SP); - Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); + + // Artificial functions such as global initializers + if (!SP->isArtificial()) + Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index f87132ee4758..7e8f8e27a97b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -221,7 +222,7 @@ public: Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); bool instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag); Value *readRegister(IRBuilder<> &IRB, StringRef Name); bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); @@ -284,7 +285,6 @@ private: FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; - FunctionCallee HwasanThreadEnterFunc; Constant *ShadowGlobal; @@ -473,9 +473,6 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HWAsanHandleVfork = M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy); - - HwasanThreadEnterFunc = - M.getOrInsertFunction("__hwasan_thread_enter", IRB.getVoidTy()); } Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) { @@ -792,7 +789,7 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, // llvm.memset right here into either a sequence of stores, or a call to // hwasan_tag_memory. 
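   // For a 64-byte alloca with the default 16-byte shadow granularity, this is
   // a single 4-byte memset of the tag.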
if (ShadowSize) - IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1); + IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align::None()); if (Size != AlignedSize) { IRB.CreateStore( ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()), @@ -934,34 +931,13 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); assert(SlotPtr); - Instruction *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - Function *F = IRB.GetInsertBlock()->getParent(); - if (F->getFnAttribute("hwasan-abi").getValueAsString() == "interceptor") { - Value *ThreadLongEqZero = - IRB.CreateICmpEQ(ThreadLong, ConstantInt::get(IntptrTy, 0)); - auto *Br = cast<BranchInst>(SplitBlockAndInsertIfThen( - ThreadLongEqZero, cast<Instruction>(ThreadLongEqZero)->getNextNode(), - false, MDBuilder(*C).createBranchWeights(1, 100000))); - - IRB.SetInsertPoint(Br); - // FIXME: This should call a new runtime function with a custom calling - // convention to avoid needing to spill all arguments here. - IRB.CreateCall(HwasanThreadEnterFunc); - LoadInst *ReloadThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - IRB.SetInsertPoint(&*Br->getSuccessor(0)->begin()); - PHINode *ThreadLongPhi = IRB.CreatePHI(IntptrTy, 2); - ThreadLongPhi->addIncoming(ThreadLong, ThreadLong->getParent()); - ThreadLongPhi->addIncoming(ReloadThreadLong, ReloadThreadLong->getParent()); - ThreadLong = ThreadLongPhi; - } - + Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. Value *ThreadLongMaybeUntagged = TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); if (WithFrameRecord) { + Function *F = IRB.GetInsertBlock()->getParent(); StackBaseTag = IRB.CreateAShr(ThreadLong, 3); // Prepare ring buffer data. @@ -1040,7 +1016,7 @@ bool HWAddressSanitizer::instrumentLandingPads( bool HWAddressSanitizer::instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) { // Ideally, we want to calculate tagged stack base pointer, and rewrite all // alloca addresses using that. Unfortunately, offsets are not known yet @@ -1062,11 +1038,15 @@ bool HWAddressSanitizer::instrumentStack( AI->replaceUsesWithIf(Replacement, [AILong](Use &U) { return U.getUser() != AILong; }); - for (auto *DDI : AllocaDeclareMap.lookup(AI)) { - DIExpression *OldExpr = DDI->getExpression(); - DIExpression *NewExpr = DIExpression::append( - OldExpr, {dwarf::DW_OP_LLVM_tag_offset, RetagMask(N)}); - DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr)); + for (auto *DDI : AllocaDbgMap.lookup(AI)) { + // Prepend "tag_offset, N" to the dwarf expression. + // Tag offset logically applies to the alloca pointer, and it makes sense + // to put it at the beginning of the expression. 
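+      // e.g. !DIExpression(DW_OP_deref) becomes
+      //   !DIExpression(DW_OP_LLVM_tag_offset, <tag>, DW_OP_deref).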
+ SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset, + RetagMask(N)}; + DDI->setArgOperand( + 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes( + DDI->getExpression(), NewOps))); } size_t Size = getAllocaSizeInBytes(*AI); @@ -1113,7 +1093,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { SmallVector<AllocaInst*, 8> AllocasToInstrument; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> LandingPadVec; - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> AllocaDeclareMap; + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap; for (auto &BB : F) { for (auto &Inst : BB) { if (ClInstrumentStack) @@ -1127,9 +1107,10 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { isa<CleanupReturnInst>(Inst)) RetVec.push_back(&Inst); - if (auto *DDI = dyn_cast<DbgDeclareInst>(&Inst)) - if (auto *Alloca = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) - AllocaDeclareMap[Alloca].push_back(DDI); + if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst)) + if (auto *Alloca = + dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation())) + AllocaDbgMap[Alloca].push_back(DDI); if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) LandingPadVec.push_back(&Inst); @@ -1172,7 +1153,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!AllocasToInstrument.empty()) { Value *StackTag = ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB); - Changed |= instrumentStack(AllocasToInstrument, AllocaDeclareMap, RetVec, + Changed |= instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 74d6e76eceb6..d5787c8f62a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp index 93d3a8a14d5c..518b8895e836 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -19,6 +20,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" @@ -28,7 +30,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include <fstream> #include <map> #include <mutex> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 1f092a5f3103..04c7e856b5d4 100644 --- 
a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 69c9020e060b..80acab307578 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -170,12 +170,14 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -202,8 +204,8 @@ using namespace llvm; #define DEBUG_TYPE "msan" static const unsigned kOriginSize = 4; -static const unsigned kMinOriginAlignment = 4; -static const unsigned kShadowTLSAlignment = 8; +static const Align kMinOriginAlignment = Align(4); +static const Align kShadowTLSAlignment = Align(8); // These constants must be kept in sync with the ones in msan.h. static const unsigned kParamTLSSize = 800; @@ -1086,15 +1088,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Fill memory range with the given origin value. void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, - unsigned Size, unsigned Alignment) { + unsigned Size, Align Alignment) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + const Align IntptrAlignment = Align(DL.getABITypeAlignment(MS.IntptrTy)); unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); assert(IntptrAlignment >= kMinOriginAlignment); assert(IntptrSize >= kOriginSize); unsigned Ofs = 0; - unsigned CurrentAlignment = Alignment; + Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); Value *IntptrOriginPtr = @@ -1102,7 +1104,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = 0; i < Size / IntptrSize; ++i) { Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) : IntptrOriginPtr; - IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment.value()); Ofs += IntptrSize / kOriginSize; CurrentAlignment = IntptrAlignment; } @@ -1111,23 +1113,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { Value *GEP = i ? 
IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr; - IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment.value()); CurrentAlignment = kMinOriginAlignment; } } void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, - Value *OriginPtr, unsigned Alignment, bool AsCall) { + Value *OriginPtr, Align Alignment, bool AsCall) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); if (Shadow->getType()->isAggregateType()) { paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); @@ -1164,12 +1165,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - unsigned Alignment = SI->getAlignment(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align Alignment = assumeAligned(SI->getAlignment()); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); - StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment); + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment.value()); LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; @@ -1207,8 +1209,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { insertWarningFn(IRB, Origin); } @@ -1403,10 +1404,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Shadow = ShadowBase + Offset /// Origin = (OriginBase + Offset) & ~3ULL - std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr, - IRBuilder<> &IRB, - Type *ShadowTy, - unsigned Alignment) { + std::pair<Value *, Value *> + getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, + MaybeAlign Alignment) { Value *ShadowOffset = getShadowPtrOffset(Addr, IRB); Value *ShadowLong = ShadowOffset; uint64_t ShadowBase = MS.MapParams->ShadowBase; @@ -1424,8 +1424,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (OriginBase != 0) OriginLong = IRB.CreateAdd(OriginLong, ConstantInt::get(MS.IntptrTy, OriginBase)); - if (Alignment < kMinOriginAlignment) { - uint64_t Mask = kMinOriginAlignment - 1; + if (!Alignment || *Alignment < kMinOriginAlignment) { + uint64_t Mask = kMinOriginAlignment.value() - 1; OriginLong = IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask)); } @@ -1435,9 
+1435,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return std::make_pair(ShadowPtr, OriginPtr); } - std::pair<Value *, Value *> - getShadowOriginPtrKernel(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, bool isStore) { + std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr, + IRBuilder<> &IRB, + Type *ShadowTy, + bool isStore) { Value *ShadowOriginPtrs; const DataLayout &DL = F.getParent()->getDataLayout(); int Size = DL.getTypeStoreSize(ShadowTy); @@ -1462,14 +1463,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, + MaybeAlign Alignment, bool isStore) { - std::pair<Value *, Value *> ret; if (MS.CompileKernel) - ret = getShadowOriginPtrKernel(Addr, IRB, ShadowTy, Alignment, isStore); - else - ret = getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); - return ret; + return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore); + return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); } /// Compute the shadow address for a given function argument. @@ -1619,11 +1617,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. - unsigned ArgAlign = FArg.getParamAlignment(); - if (ArgAlign == 0) { - Type *EltType = A->getType()->getPointerElementType(); - ArgAlign = DL.getABITypeAlignment(EltType); - } + const Align ArgAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(FArg.getParamAlignment()), + A->getType()->getPointerElementType()); Value *CpShadowPtr = getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign, /*isStore*/ true) @@ -1635,7 +1631,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); } else { - unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); + const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, CopyAlign, Size); LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); @@ -1647,8 +1643,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ParamTLS overflow. 
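// Aside: a simplified stand-in (hypothetical helper, not part of the
// patch) for the bounds test behind this fallback. kParamTLSSize is the
// fixed 800-byte __msan_param_tls window declared earlier in this file;
// shadow that would land past it cannot be passed through TLS, so the
// argument is treated as fully initialized instead.
static const unsigned kParamTLSSize = 800;

static bool fitsInParamTLS(unsigned ArgOffset, unsigned ArgSize) {
  // Mirrors the "ArgOffset + Size > kParamTLSSize" overflow checks above.
  return ArgOffset + ArgSize <= kParamTLSSize;
}
// The clean-shadow fallback below is taken when this check fails.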
*ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, - kShadowTLSAlignment); + *ShadowPtr = EntryIRB.CreateAlignedLoad( + getShadowTy(&FArg), Base, kShadowTLSAlignment.value()); } } LLVM_DEBUG(dbgs() @@ -1782,13 +1778,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - Value *ShadowPtr, *OriginPtr; - unsigned Alignment = I.getAlignment(); + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; + const Align Alignment = assumeAligned(I.getAlignment()); if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -1801,9 +1797,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) { if (PropagateShadow) { - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); - setOrigin( - &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment)); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); + setOrigin(&I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, + OriginAlignment.value())); } else { setOrigin(&I, getCleanOrigin()); } @@ -1825,8 +1821,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getOperand(0); - Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), - /*Alignment*/ 1, /*isStore*/ true) + Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align::None(), + /*isStore*/ true) .first; if (ClCheckAccessAddress) @@ -2458,7 +2454,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // We don't know the pointer alignment (could be unaligned SSE store!). // Have to assume the worst case. std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Align::None(), /*isStore*/ true); IRB.CreateAlignedStore(Shadow, ShadowPtr, 1); if (ClCheckAccessAddress) @@ -2478,15 +2474,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Addr = I.getArgOperand(0); Type *ShadowTy = getShadowTy(&I); - Value *ShadowPtr, *OriginPtr; + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; if (PropagateShadow) { // We don't know the pointer alignment (could be unaligned SSE load!). // Have to assume the worst case. 
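// Aside: a minimal sketch (assuming the LLVM 10-era llvm/Support/Alignment.h
// used throughout this patch) of the convention replacing the old literal 1:
// Align::None() is the one-byte, most conservative alignment, which is why
// it appears wherever the pointer alignment is unknown.
#include "llvm/Support/Alignment.h"
#include <cassert>

static llvm::Align worstCaseAlign() {
  llvm::Align A = llvm::Align::None();
  assert(A.value() == 1 && "the worst case is byte alignment");
  return A;
}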
- unsigned Alignment = 1; + const Align Alignment = Align::None(); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -2873,7 +2869,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value* Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); Value *ShadowPtr = - getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true) + getShadowOriginPtr(Addr, IRB, Ty, Align::None(), /*isStore*/ true) .first; IRB.CreateStore(getCleanShadow(Ty), @@ -2889,7 +2885,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); - unsigned Alignment = 1; + const Align Alignment = Align::None(); Value *ShadowPtr, *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false); @@ -2897,7 +2893,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (ClCheckAccessAddress) insertShadowCheck(Addr, &I); - Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr"); + Value *Shadow = + IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment.value(), "_ldmxcsr"); Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr) : getCleanOrigin(); insertShadowCheck(Shadow, Origin, &I); @@ -2907,14 +2904,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); Value *Addr = I.getArgOperand(1); - unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()); Value *Mask = I.getArgOperand(3); Value *Shadow = getShadow(V); Value *ShadowPtr; Value *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), Align, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true); if (ClCheckAccessAddress) { insertShadowCheck(Addr, &I); @@ -2923,20 +2921,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(Mask, &I); } - IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask); + IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment ? 
Alignment->value() : 0, + Mask); if (MS.TrackOrigins) { auto &DL = F.getParent()->getDataLayout(); paintOrigin(IRB, getOrigin(V), OriginPtr, DL.getTypeStoreSize(Shadow->getType()), - std::max(Align, kMinOriginAlignment)); + llvm::max(Alignment, kMinOriginAlignment)); } } bool handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); - unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); Value *Mask = I.getArgOperand(2); Value *PassThru = I.getArgOperand(3); @@ -2944,9 +2944,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ShadowPtr, *OriginPtr; if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = - getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false); - setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask, - getShadow(PassThru), "_msmaskedld")); + getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); + setShadow(&I, IRB.CreateMaskedLoad( + ShadowPtr, Alignment ? Alignment->value() : 0, Mask, + getShadow(PassThru), "_msmaskedld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -3278,7 +3279,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Clear out readonly/readnone attributes. AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); Func->removeAttributes(AttributeList::FunctionIndex, B); } @@ -3312,8 +3316,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { "ByVal argument is not a pointer!"); Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); if (ArgOffset + Size > kParamTLSSize) break; - unsigned ParamAlignment = CS.getParamAlignment(i); - unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); + const MaybeAlign ParamAlignment(CS.getParamAlignment(i)); + MaybeAlign Alignment = llvm::None; + if (ParamAlignment) + Alignment = std::min(*ParamAlignment, kShadowTLSAlignment); Value *AShadowPtr = getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ false) @@ -3326,7 +3332,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Size = DL.getTypeAllocSize(A->getType()); if (ArgOffset + Size > kParamTLSSize) break; Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, - kShadowTLSAlignment); + kShadowTLSAlignment.value()); Constant *Cst = dyn_cast<Constant>(ArgShadow); if (Cst && Cst->isNullValue()) ArgIsInitialized = true; } @@ -3352,7 +3358,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. 
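// Aside: a standalone model (simplified, not the real MSan runtime ABI) of
// the return-value shadow protocol used here: the caller zeroes the retval
// shadow slot before the call and reads it back right after, so calls into
// uninstrumented code observe a clean result instead of stale shadow.
#include <cstdint>

thread_local uint64_t RetvalShadowTLS; // stands in for __msan_retval_tls

uint64_t instrumentedCallee() {
  RetvalShadowTLS = 0; // an instrumented callee stores its result's shadow
  return 42;
}

uint64_t callWithShadow(uint64_t &ResultShadow) {
  RetvalShadowTLS = 0;            // "make sure the retval shadow is 0"
  uint64_t R = instrumentedCallee();
  ResultShadow = RetvalShadowTLS; // corresponds to the "_msret" load below
  return R;
}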
Value *Base = getShadowPtrForRetval(&I, IRBBefore); - IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, + kShadowTLSAlignment.value()); BasicBlock::iterator NextInsn; if (CS.isCall()) { NextInsn = ++I.getIterator(); @@ -3376,7 +3383,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad( getShadowTy(&I), getShadowPtrForRetval(&I, IRBAfter), - kShadowTLSAlignment, "_msret"); + kShadowTLSAlignment.value(), "_msret"); setShadow(&I, RetvalShadow); if (MS.TrackOrigins) setOrigin(&I, IRBAfter.CreateLoad(MS.OriginTy, @@ -3403,10 +3410,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (CheckReturnValue) { insertShadowCheck(RetVal, &I); Value *Shadow = getCleanShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); } else { Value *Shadow = getShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); if (MS.TrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -3447,11 +3454,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len}); } else { Value *ShadowBase, *OriginBase; - std::tie(ShadowBase, OriginBase) = - getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), 1, /*isStore*/ true); + std::tie(ShadowBase, OriginBase) = getShadowOriginPtr( + &I, IRB, IRB.getInt8Ty(), Align::None(), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment()); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, + MaybeAlign(I.getAlignment())); } if (PoisonStack && MS.TrackOrigins) { @@ -3863,7 +3871,7 @@ struct VarArgAMD64Helper : public VarArgHelper { if (!ShadowBase) continue; Value *Shadow = MSV.getShadow(A); - IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment.value()); if (MS.TrackOrigins) { Value *Origin = MSV.getOrigin(A); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); @@ -3904,7 +3912,7 @@ struct VarArgAMD64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -3942,10 +3950,11 @@ struct VarArgAMD64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); if (MS.TrackOrigins) { VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSOriginCopy, 8, MS.VAArgOriginTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS, + Align(8), CopySize); } } @@ -3964,7 +3973,7 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 16; + const Align Alignment = Align(16); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4032,7 +4041,8 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAArgOffset = alignTo(VAArgOffset, 8); if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset); @@ -4058,7 +4068,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4070,7 +4080,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4089,7 +4099,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. 
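// Aside (hypothetical helper; the IRBuilder calls match the LLVM 10-era
// signatures used by this patch): the backup below is needed because
// va_arg_tls is shared by every call in the function, so any call emitted
// before va_start would clobber it; an entry-block alloca snapshots the
// variadic shadow while it is still valid.
#include "llvm/IR/IRBuilder.h"

static llvm::Value *backupVAArgTLS(llvm::IRBuilder<> &IRB,
                                   llvm::LLVMContext &C,
                                   llvm::Value *VAArgTLS,
                                   llvm::Value *CopySize) {
  llvm::Value *Copy = IRB.CreateAlloca(llvm::Type::getInt8Ty(C), CopySize);
  IRB.CreateMemCpy(Copy, llvm::Align(8), VAArgTLS, llvm::Align(8), CopySize);
  return Copy;
}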
VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. @@ -4105,7 +4115,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4203,7 +4213,8 @@ struct VarArgAArch64Helper : public VarArgHelper { continue; if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *OverflowSize = ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); @@ -4227,7 +4238,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4239,7 +4250,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4280,7 +4291,7 @@ struct VarArgAArch64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); @@ -4331,14 +4342,15 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *GrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, GrRegSaveAreaShadowPtrOff); Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize); + IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8), + GrCopySize); // Again, but for FP/SIMD values. 
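// Aside: the shape of the va_list shadow copied here, as a standalone
// sketch. The 64/128-byte figures are assumptions mirroring the pass's
// kAArch64GrArgSize/kAArch64VrArgSize constants (8 general registers of 8
// bytes, 8 FP/SIMD registers of 16 bytes under AAPCS64); the stack
// overflow area then starts at AArch64VAEndOffset.
struct AArch64VAShadowLayout {
  static constexpr unsigned GrSize = 64;  // x0-x7, 8 bytes each
  static constexpr unsigned VrSize = 128; // q0-q7, 16 bytes each
  static constexpr unsigned grOffset(unsigned Reg) { return Reg * 8; }
  static constexpr unsigned vrOffset(unsigned Reg) { return GrSize + Reg * 16; }
  static constexpr unsigned stackOffset() { return GrSize + VrSize; }
};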
Value *VrRegSaveAreaShadowPtrOff = @@ -4346,7 +4358,7 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *VrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *VrSrcPtr = IRB.CreateInBoundsGEP( @@ -4356,20 +4368,21 @@ struct VarArgAArch64Helper : public VarArgHelper { VrRegSaveAreaShadowPtrOff); Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize); + IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8), + VrCopySize); // And finally for remaining arguments. Value *StackSaveAreaShadowPtr = MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 16, /*isStore*/ true) + Align(16), /*isStore*/ true) .first; Value *StackSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, IRB.getInt32(AArch64VAEndOffset)); - IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16, - VAArgOverflowSize); + IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr, + Align(16), VAArgOverflowSize); } } }; @@ -4461,7 +4474,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } VAArgOffset += ArgSize; VAArgOffset = alignTo(VAArgOffset, 8); @@ -4494,7 +4508,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4505,7 +4519,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); // Unpoison the whole __va_list_tag. @@ -4526,7 +4540,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. @@ -4542,7 +4556,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4595,7 +4609,10 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { // Clear out readonly/readnone attributes. 
AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); F.removeAttributes(AttributeList::FunctionIndex, B); return Visitor.runOnFunction(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index ca1bb62389e9..cc96bdd1d516 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -47,6 +47,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "CFGMST.h" #include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" @@ -92,6 +93,7 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" @@ -106,7 +108,6 @@ #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/MisExpect.h" #include <algorithm> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 9f81bb16d0a7..d0afe2959b39 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index 81d92e724c7d..71ecfd9a2642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -65,10 +65,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index f8fa9cad03b8..e6dc684c2e77 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git 
a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index ac274a155a80..9b7edad3444b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -26,7 +26,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -37,6 +36,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -45,6 +45,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index b341dd807508..7a01ec967fb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -26,6 +26,7 @@ #include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 36aa513ec554..ecf8220ae95d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -34,6 +34,8 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 04e98d8f5577..205d8ddf151d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassAnalysisSupport.h" #include "llvm/PassRegistry.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 6653ff0bb91a..b80c1675050b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -58,8 +58,10 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include 
"llvm/Support/ErrorHandling.h" @@ -506,6 +508,20 @@ namespace { ARCInstKind &Class); void OptimizeIndividualCalls(Function &F); + /// Optimize an individual call, optionally passing the + /// GetArgRCIdentityRoot if it has already been computed. + void OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg); + + /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the + /// optimization occurs, returns true to indicate that the caller should + /// assume the instructions are dead. + bool OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg); + void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; @@ -589,8 +605,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { } /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is -/// not a return value. Or, if it can be paired with an -/// objc_autoreleaseReturnValue, delete the pair and return true. +/// not a return value. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. @@ -616,39 +631,6 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { } } - // Track PHIs which are equivalent to our Arg. - SmallDenseSet<const Value*, 2> EquivalentArgs; - EquivalentArgs.insert(Arg); - - // Add PHIs that are equivalent to Arg to ArgUsers. - if (const PHINode *PN = dyn_cast<PHINode>(Arg)) { - SmallVector<const Value *, 2> ArgUsers; - getEquivalentPHIs(*PN, ArgUsers); - EquivalentArgs.insert(ArgUsers.begin(), ArgUsers.end()); - } - - // Check for being preceded by an objc_autoreleaseReturnValue on the same - // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV->getIterator(), - Begin = RetainRV->getParent()->begin(); - if (I != Begin) { - do - --I; - while (I != Begin && IsNoopInstruction(&*I)); - if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && - EquivalentArgs.count(GetArgRCIdentityRoot(&*I))) { - Changed = true; - ++NumPeeps; - - LLVM_DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" - << "Erasing " << *RetainRV << "\n"); - - EraseInstruction(&*I); - EraseInstruction(RetainRV); - return true; - } - } - // Turn it to a plain objc_retain. Changed = true; ++NumPeeps; @@ -666,6 +648,62 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { return false; } +bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) { + // Must be in the same basic block. + assert(Inst->getParent() == AutoreleaseRV->getParent()); + + // Must operate on the same root. + Arg = GetArgRCIdentityRoot(Inst); + AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV); + if (Arg != AutoreleaseRVArg) { + // If there isn't an exact match, check if we have equivalent PHIs. + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) + return false; + + SmallVector<const Value *, 4> ArgUsers; + getEquivalentPHIs(*PN, ArgUsers); + if (llvm::find(ArgUsers, AutoreleaseRVArg) == ArgUsers.end()) + return false; + } + + // Okay, this is a match. 
Merge them. + ++NumPeeps; + LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '" + << *AutoreleaseRV << "' paired with '" << *Inst << "'\n"); + + // Delete the RV pair, starting with the AutoreleaseRV. + AutoreleaseRV->replaceAllUsesWith( + cast<CallInst>(AutoreleaseRV)->getArgOperand(0)); + EraseInstruction(AutoreleaseRV); + if (Class == ARCInstKind::RetainRV) { + // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV. + Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + EraseInstruction(Inst); + return true; + } + + // ClaimRV is a frontend peephole for RetainRV + Release. Since the + // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release. + assert(Class == ARCInstKind::ClaimRV); + Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0); + CallInst *Release = CallInst::Create( + EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst); + assert(IsAlwaysTail(ARCInstKind::ClaimRV) && + "Expected ClaimRV to be safe to tail call"); + Release->setTailCall(); + Inst->replaceAllUsesWith(CallArg); + EraseInstruction(Inst); + + // Run the normal optimizations on Release. + OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release, + Arg); + return true; +} + /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not /// used as a return value. void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, @@ -752,286 +790,370 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) BlockColors = colorEHFunclets(F); + // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired + // with RetainRV and ClaimRV. + Instruction *DelayedAutoreleaseRV = nullptr; + const Value *DelayedAutoreleaseRVArg = nullptr; + auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) { + assert(!DelayedAutoreleaseRV || !AutoreleaseRV); + DelayedAutoreleaseRV = AutoreleaseRV; + DelayedAutoreleaseRVArg = nullptr; + }; + auto optimizeDelayedAutoreleaseRV = [&]() { + if (!DelayedAutoreleaseRV) + return; + OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV, + ARCInstKind::AutoreleaseRV, + DelayedAutoreleaseRVArg); + setDelayedAutoreleaseRV(nullptr); + }; + auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) { + // Nothing to delay, but we may as well skip the logic below. + if (!DelayedAutoreleaseRV) + return true; + + // If we hit the end of the basic block we're not going to find an RV-pair. + // Stop delaying. + if (NonARCInst->isTerminator()) + return false; + + // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and + // ClaimRV, it's probably safe to skip over even opaque function calls + // here since OptimizeInlinedAutoreleaseRVCall will confirm that they + // have the same RCIdentityRoot. However, what really matters is + // skipping instructions or intrinsics that the inliner could leave behind; + // be conservative for now and don't skip over opaque calls, which could + // potentially include other ARC calls. + auto *CB = dyn_cast<CallBase>(NonARCInst); + if (!CB) + return true; + return CB->getIntrinsicID() != Intrinsic::not_intrinsic; + }; + // Visit all objc_* calls in F. 
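// Aside: a standalone model (hypothetical type, not part of the pass) of
// the delay/flush discipline the three lambdas above implement: at most one
// AutoreleaseRV candidate is held at a time, and it is either merged with
// the next RetainRV/ClaimRV on the same object or flushed through the
// normal per-call optimization as soon as something could separate the pair.
#include <cassert>

struct DelayedRVSlot {
  llvm::Instruction *Delayed = nullptr;
  void hold(llvm::Instruction *AutoreleaseRV) {
    assert(!Delayed && "previous candidate must be flushed first");
    Delayed = AutoreleaseRV;
  }
  llvm::Instruction *flush() {
    llvm::Instruction *I = Delayed;
    Delayed = nullptr;
    return I; // caller runs the normal AutoreleaseRV optimization on I
  }
};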
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; ARCInstKind Class = GetBasicARCInstKind(Inst); - LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); - - // Some of the ARC calls can be deleted if their arguments are global - // variables that are inert in ARC. - if (IsNoopOnGlobal(Class)) { - Value *Opnd = Inst->getOperand(0); - if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) - if (GV->hasAttribute("objc_arc_inert")) { - if (!Inst->getType()->isVoidTy()) - Inst->replaceAllUsesWith(Opnd); - Inst->eraseFromParent(); - continue; - } - } - + // Skip this loop if this instruction isn't itself an ARC intrinsic. + const Value *Arg = nullptr; switch (Class) { - default: break; - - // Delete no-op casts. These function calls have special semantics, but - // the semantics are entirely implemented via lowering in the front-end, - // so by the time they reach the optimizer, they are just no-op calls - // which return their argument. - // - // There are gray areas here, as the ability to cast reference-counted - // pointers to raw void* and back allows code to break ARC assumptions, - // however these are currently considered to be unimportant. - case ARCInstKind::NoopCast: - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - - // If the pointer-to-weak-pointer is null, it's undefined behavior. - case ARCInstKind::StoreWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::InitWeak: - case ARCInstKind::DestroyWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::CopyWeak: - case ARCInstKind::MoveWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0)) || - IsNullOrUndef(CI->getArgOperand(1))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::RetainRV: - if (OptimizeRetainRVCall(F, Inst)) - continue; + default: + optimizeDelayedAutoreleaseRV(); break; + case ARCInstKind::CallOrUser: + case ARCInstKind::User: + case ARCInstKind::None: + // This is a non-ARC instruction. If we're delaying an AutoreleaseRV, + // check if it's safe to skip over it; if not, optimize the AutoreleaseRV + // now. 
+ if (!shouldDelayAutoreleaseRV(Inst)) + optimizeDelayedAutoreleaseRV(); + continue; case ARCInstKind::AutoreleaseRV: - OptimizeAutoreleaseRVCall(F, Inst, Class); + optimizeDelayedAutoreleaseRV(); + setDelayedAutoreleaseRV(Inst); + continue; + case ARCInstKind::RetainRV: + case ARCInstKind::ClaimRV: + if (DelayedAutoreleaseRV) { + // We have a potential RV pair. Check if they cancel out. + if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class, + DelayedAutoreleaseRV, + DelayedAutoreleaseRVArg)) { + setDelayedAutoreleaseRV(nullptr); + continue; + } + optimizeDelayedAutoreleaseRV(); + } break; } - // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. - if (IsAutorelease(Class) && Inst->use_empty()) { - CallInst *Call = cast<CallInst>(Inst); - const Value *Arg = Call->getArgOperand(0); - Arg = FindSingleUseIdentifiedObject(Arg); - if (Arg) { - Changed = true; - ++NumAutoreleases; - - // Create the declaration lazily. - LLVMContext &C = Inst->getContext(); - - Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); - CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "", - Call); - NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), - MDNode::get(C, None)); - - LLVM_DEBUG( - dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " - "since x is otherwise unused.\nOld: " - << *Call << "\nNew: " << *NewCall << "\n"); - - EraseInstruction(Call); - Inst = NewCall; - Class = ARCInstKind::Release; + OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg); + } + + // Catch the final delayed AutoreleaseRV. + optimizeDelayedAutoreleaseRV(); +} + +void ObjCARCOpt::OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg) { + LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); + + // Some of the ARC calls can be deleted if their arguments are global + // variables that are inert in ARC. + if (IsNoopOnGlobal(Class)) { + Value *Opnd = Inst->getOperand(0); + if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) + if (GV->hasAttribute("objc_arc_inert")) { + if (!Inst->getType()->isVoidTy()) + Inst->replaceAllUsesWith(Opnd); + Inst->eraseFromParent(); + return; } - } + } - // For functions which can never be passed stack arguments, add - // a tail keyword. - if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + switch (Class) { + default: + break; + + // Delete no-op casts. These function calls have special semantics, but + // the semantics are entirely implemented via lowering in the front-end, + // so by the time they reach the optimizer, they are just no-op calls + // which return their argument. + // + // There are gray areas here, as the ability to cast reference-counted + // pointers to raw void* and back allows code to break ARC assumptions, + // however these are currently considered to be unimportant. + case ARCInstKind::NoopCast: + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); + EraseInstruction(Inst); + return; + + // If the pointer-to-weak-pointer is null, it's undefined behavior. 
+ case ARCInstKind::StoreWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::InitWeak: + case ARCInstKind::DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( - dbgs() << "Adding tail keyword to function since it can never be " - "passed stack args: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setTailCall(); + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } - - // Ensure that functions that can never have a "tail" keyword due to the - // semantics of ARC truly do not do so. - if (IsNeverTail(Class)) { + break; + } + case ARCInstKind::CopyWeak: + case ARCInstKind::MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0)) || + IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; - LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst - << "\n"); - cast<CallInst>(Inst)->setTailCall(false); + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + + Value *NewValue = UndefValue::get(CI->getType()); + LLVM_DEBUG( + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } + break; + } + case ARCInstKind::RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + return; + break; + case ARCInstKind::AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst, Class); + break; + } - // Set nounwind as needed. - if (IsNoThrow(Class)) { + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { Changed = true; - LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setDoesNotThrow(); - } + ++NumAutoreleases; - if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << unsigned(Class); - continue; - } + // Create the declaration lazily. + LLVMContext &C = Inst->getContext(); - const Value *Arg = GetArgRCIdentityRoot(Inst); + Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); + CallInst *NewCall = + CallInst::Create(Decl, Call->getArgOperand(0), "", Call); + NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), + MDNode::get(C, None)); - // ARC calls with null are no-ops. Delete them. - if (IsNullOrUndef(Arg)) { - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst - << "\n"); - EraseInstruction(Inst); - continue; + LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " + "since x is otherwise unused.\nOld: " + << *Call << "\nNew: " << *NewCall << "\n"); + + EraseInstruction(Call); + Inst = NewCall; + Class = ARCInstKind::Release; } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. 
+ if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + Changed = true; + LLVM_DEBUG( + dbgs() << "Adding tail keyword to function since it can never be " + "passed stack args: " + << *Inst << "\n"); + cast<CallInst>(Inst)->setTailCall(); + } + + // Ensure that functions that can never have a "tail" keyword due to the + // semantics of ARC truly do not do so. + if (IsNeverTail(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setTailCall(false); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setDoesNotThrow(); + } - // Keep track of which of retain, release, autorelease, and retain_block - // are actually present in this function. + // Note: This catches instructions unrelated to ARC. + if (!IsNoopOnNull(Class)) { UsedInThisFunction |= 1 << unsigned(Class); + return; + } + + // If we haven't already looked up the root, look it up now. + if (!Arg) + Arg = GetArgRCIdentityRoot(Inst); + + // ARC calls with null are no-ops. Delete them. + if (IsNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst + << "\n"); + EraseInstruction(Inst); + return; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << unsigned(Class); + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, and the call + // is not a release that doesn't have the clang.imprecise_release tag, the + // call could be pushed up to just those paths with non-null incoming + // values. For now, don't bother splitting critical edges for this. + if (Class == ARCInstKind::Release && + !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + return; + + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; - // If Arg is a PHI, and one or more incoming values to the - // PHI are null, and the call is control-equivalent to the PHI, and there - // are no relevant side effects between the PHI and the call, and the call - // is not a release that doesn't have the clang.imprecise_release tag, the - // call could be pushed up to just those paths with non-null incoming - // values. For now, don't bother splitting critical edges for this. - if (Class == ARCInstKind::Release && - !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; - SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; - Worklist.push_back(std::make_pair(Inst, Arg)); - do { - std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); - Inst = Pair.first; - Arg = Pair.second; - - const PHINode *PN = dyn_cast<PHINode>(Arg); - if (!PN) continue; - - // Determine if the PHI has any null operands, or any incoming - // critical edges. 
- bool HasNull = false; - bool HasCriticalEdges = false; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (IsNullOrUndef(Incoming)) - HasNull = true; - else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != - 1) { - HasCriticalEdges = true; - break; - } + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + HasNull = true; + else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != + 1) { + HasCriticalEdges = true; + break; } - // If we have null operands and no critical edges, optimize. - if (!HasCriticalEdges && HasNull) { - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - - // Check that there is nothing that cares about the reference - // count between the call and the phi. - switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainBlock: - // These can always be moved up. - break; - case ARCInstKind::Release: - // These can't be moved across things that care about the retain - // count. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::Autorelease: - // These can't be moved across autorelease pool scope boundaries. - FindDependencies(AutoreleasePoolBoundary, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::ClaimRV: - case ARCInstKind::RetainRV: - case ARCInstKind::AutoreleaseRV: - // Don't move these; the RV optimization depends on the autoreleaseRV - // being tail called, and the retainRV being immediately after a call - // (which might still happen if we get lucky with codegen layout, but - // it's not worth taking the chance). - continue; - default: - llvm_unreachable("Invalid dependence flavor"); - } + } + // If we have null operands and no critical edges, optimize. + if (HasCriticalEdges) + continue; + if (!HasNull) + continue; - if (DependingInstructions.size() == 1 && - *DependingInstructions.begin() == PN) { - Changed = true; - ++NumPartialNoops; - // Clone the call into each predecessor that has a non-null value. - CallInst *CInst = cast<CallInst>(Inst); - Type *ParamTy = CInst->getArgOperand(0)->getType(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (!IsNullOrUndef(Incoming)) { - Value *Op = PN->getIncomingValue(i); - Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); - CallInst *Clone = cast<CallInst>(CloneCallInstForBB( - *CInst, *InsertPos->getParent(), BlockColors)); - if (Op->getType() != ParamTy) - Op = new BitCastInst(Op, ParamTy, "", InsertPos); - Clone->setArgOperand(0, Op); - Clone->insertBefore(InsertPos); - - LLVM_DEBUG(dbgs() << "Cloning " << *CInst - << "\n" - "And inserting clone at " - << *InsertPos << "\n"); - Worklist.push_back(std::make_pair(Clone, Incoming)); - } - } - // Erase the original call. 
- LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); - EraseInstruction(CInst); - continue; - } - } - } while (!Worklist.empty()); - } + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainBlock: + // These can always be moved up. + break; + case ARCInstKind::Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::ClaimRV: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). + continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() != 1) + continue; + if (*DependingInstructions.begin() != PN) + continue; + + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. + CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + continue; + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + CallInst *Clone = cast<CallInst>( + CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors)); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + + LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n" + "And inserting clone at " + << *InsertPos << "\n"); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + // Erase the original call. 
+ LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); + EraseInstruction(CInst); + } while (!Worklist.empty()); } /// If we have a top down pointer in the S_Use state, make sure that there are diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index b768f7973b87..99a2055aba94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 7f7460c5746a..cc3d3bf7cdbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 0e9f03a06061..06deaf3c4f9a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp index 9bd387c33e80..0fa38fa80b17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -19,13 +19,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "bdce" @@ -101,7 +102,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isNullValue() && wouldInstructionBeTriviallyDead(&I))) { - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); Worklist.push_back(&I); I.dropAllReferences(); Changed = true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index c3fba923104f..e34c011b1c87 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -59,13 +59,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include 
"llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 9f340afbf7c2..5bfece010bec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -14,7 +14,7 @@ // cost. If the constant can be folded into the instruction (the cost is // TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't // consider it expensive and leave it alone. This is the default behavior and -// the default implementation of getIntImmCost will always return TCC_Free. +// the default implementation of getIntImmCostInst will always return TCC_Free. // // If the cost is more than TCC_BASIC, then the integer constant can't be folded // into the instruction and it might be beneficial to hoist the constant. @@ -43,7 +43,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -54,6 +53,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/Casting.h" @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -361,11 +362,11 @@ void ConstantHoistingPass::collectConstantCandidates( // Ask the target about the cost of materializing the constant for the given // instruction and operand index. if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst)) - Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx, - ConstInt->getValue(), ConstInt->getType()); + Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx, + ConstInt->getValue(), ConstInt->getType()); else - Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(), - ConstInt->getType()); + Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(), + ConstInt->getType()); // Ignore cheap integer constants. if (Cost > TargetTransformInfo::TCC_Basic) { @@ -415,7 +416,7 @@ void ConstantHoistingPass::collectConstantCandidates( // usually lowered to a load from constant pool. Such operation is unlikely // to be cheaper than compute it by <Base + Offset>, which can be lowered to // an ADD instruction or folded into Load/Store instruction. 
- int Cost = TTI->getIntImmCost(Instruction::Add, 1, Offset, PtrIntTy); + int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy); ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV]; ConstCandMapType::iterator Itr; bool Inserted; @@ -486,9 +487,10 @@ void ConstantHoistingPass::collectConstantCandidates( // Scan all operands. for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { // The cost of materializing the constants (defined in - // `TargetTransformInfo::getIntImmCost`) for instructions which only take - // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So - // it's safe for us to collect constant candidates from all IntrinsicInsts. + // `TargetTransformInfo::getIntImmCostInst`) for instructions which only + // take constant variables is lower than `TargetTransformInfo::TCC_Basic`. + // So it's safe for us to collect constant candidates from all + // IntrinsicInsts. if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) { collectConstantCandidates(ConstCandMap, Inst, Idx); } @@ -499,9 +501,13 @@ void ConstantHoistingPass::collectConstantCandidates( /// into an instruction itself. void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (BasicBlock &BB : Fn) + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; for (Instruction &Inst : BB) collectConstantCandidates(ConstCandMap, &Inst); + } } // This helper function is necessary to deal with values that have different @@ -552,7 +558,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); @@ -575,7 +582,7 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, for (auto User : ConstCand->Uses) { unsigned Opcode = User.Inst->getOpcode(); unsigned OpndIdx = User.OpndIdx; - Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty); + Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty); LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp index e9e6afe3fdd4..73bf1d521b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 2ef85268df48..3435bc7f5eaa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" 
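The ConstantHoisting hunks above split the old getIntImmCost query into getIntImmCostInst and getIntImmCostIntrin. Below is a minimal sketch of a caller against the TargetTransformInfo interface as it stands in this snapshot; the helper name queryImmCost is illustrative, not part of the patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;

static int queryImmCost(const TargetTransformInfo &TTI,
                        const Instruction *Inst, unsigned Idx,
                        const ConstantInt *CI) {
  // Immediates feeding intrinsic calls are costed per intrinsic ID...
  if (auto *II = dyn_cast<IntrinsicInst>(Inst))
    return TTI.getIntImmCostIntrin(II->getIntrinsicID(), Idx, CI->getValue(),
                                   CI->getType());
  // ...all other uses are costed per opcode and operand index.
  return TTI.getIntImmCostInst(Inst->getOpcode(), Idx, CI->getValue(),
                               CI->getType());
}

Costs at or below TargetTransformInfo::TCC_Basic are left alone; only more expensive immediates become hoisting candidates.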
#include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -194,7 +195,14 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, } // All constant incoming values map to the same variable along the incoming - // edges of the phi. The phi is unnecessary. + // edges of the phi. The phi is unnecessary. However, we must drop all + // poison-generating flags to ensure that no poison is propagated to the phi + // location by performing this substitution. + // Warning: If the underlying analysis changes, this may not be enough to + // guarantee that poison is not propagated. + // TODO: We may be able to re-infer flags by re-analyzing the instruction. + if (auto *CommonInst = dyn_cast<Instruction>(CommonValue)) + CommonInst->dropPoisonGeneratingFlags(); P->replaceAllUsesWith(CommonValue); P->eraseFromParent(); ++NumPhiCommon; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp index a79d775aa7f3..a4b0c8df98f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "dce" @@ -80,6 +82,43 @@ Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); } +//===--------------------------------------------------------------------===// +// RedundantDbgInstElimination pass implementation +// + +namespace { +struct RedundantDbgInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantDbgInstElimination() : FunctionPass(ID) { + initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + bool Changed = false; + for (auto &BB : F) + Changed |= RemoveRedundantDbgInstrs(&BB); + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; +} + +char RedundantDbgInstElimination::ID = 0; +INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim", + "Redundant Dbg Instruction Elimination", false, false) + +Pass *llvm::createRedundantDbgInstEliminationPass() { + return new RedundantDbgInstElimination(); +} + +//===--------------------------------------------------------------------===// +// DeadCodeElimination pass implementation +// + static bool DCEInstruction(Instruction *I, SmallSetVector<Instruction *, 16> &WorkList, const TargetLibraryInfo *TLI) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 685de82810ed..1ba4aab999e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" 
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -48,6 +49,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -99,6 +101,7 @@ static void deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, MemoryDependenceResults &MD, const TargetLibraryInfo &TLI, InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst, SmallSetVector<const Value *, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -112,6 +115,10 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, // Before we touch this instruction, remove it from memdep! do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); + // Mark the DeadInst as dead in the list of throwable instructions. + auto It = ThrowableInst.find(DeadInst); + if (It != ThrowableInst.end()) + ThrowableInst[It->first] = false; ++NumFastOther; // Try to preserve debug information attached to the dead instruction. @@ -144,6 +151,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, DeadInst->eraseFromParent(); } while (!NowDeadInsts.empty()); *BBI = NewIter; + // Pop dead entries from back of ThrowableInst till we find an alive entry. + while (!ThrowableInst.empty() && !ThrowableInst.back().second) + ThrowableInst.pop_back(); } /// Does this instruction write some memory? This only returns true for things @@ -169,15 +179,18 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - StringRef FnName = F->getName(); - if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) - return true; - if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) - return true; - if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) - return true; - if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) - return true; + LibFunc LF; + if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) { + switch (LF) { + case LibFunc_strcpy: + case LibFunc_strncpy: + case LibFunc_strcat: + case LibFunc_strncat: + return true; + default: + return false; + } + } } } return false; @@ -656,7 +669,8 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); @@ -690,7 +704,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, // DCE instructions only used to calculate that store. 
BasicBlock::iterator BBI(Dependency); - deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -747,8 +762,8 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, MemoryDependenceResults *MD, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; // Keep track of all of the stack objects that are dead at the end of the @@ -809,7 +824,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, << '\n'); // DCE instructions only used to calculate that store. - deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastStores; MadeChange = true; @@ -821,7 +836,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (isInstructionTriviallyDead(&*BBI, TLI)) { LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " << *&*BBI << '\n'); - deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastOther; MadeChange = true; @@ -1028,7 +1043,8 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, const DataLayout &DL, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { // Must be a store instruction. StoreInst *SI = dyn_cast<StoreInst>(Inst); if (!SI) @@ -1044,7 +1060,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1062,7 +1078,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1077,7 +1093,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool MadeChange = false; OrderedBasicBlock OBB(&BB); - Instruction *LastThrowing = nullptr; + MapVector<Instruction *, bool> ThrowableInst; // A map of interval maps representing partially-overwritten value parts. InstOverlapIntervalsTy IOL; @@ -1086,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { // Handle 'free' calls specially. if (CallInst *F = isFreeCall(&*BBI, TLI)) { - MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB); + MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB, ThrowableInst); // Increment BBI after handleFree has potentially deleted instructions. // This ensures we maintain a valid iterator. 
++BBI; @@ -1096,7 +1112,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, Instruction *Inst = &*BBI++; if (Inst->mayThrow()) { - LastThrowing = Inst; + ThrowableInst[Inst] = true; continue; } @@ -1105,7 +1121,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // eliminateNoopStore will update in iterator, if necessary. - if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB)) { + if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB, + ThrowableInst)) { MadeChange = true; continue; } @@ -1148,6 +1165,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (!DepLoc.Ptr) break; + // Find the last throwable instruction not removed by call to + // deleteDeadInstruction. + Instruction *LastThrowing = nullptr; + if (!ThrowableInst.empty()) + LastThrowing = ThrowableInst.back().first; + // Make sure we don't look past a call which might throw. This is an // issue because MemoryDependenceAnalysis works in the wrong direction: // it finds instructions which dominate the current instruction, rather than @@ -1187,7 +1210,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -1269,8 +1293,10 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, OBB.replaceInstruction(DepWrite, SI); // Delete the old stores and now-dead instructions that feed them. - deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB); - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); MadeChange = true; // We erased DepWrite and Inst (Loc); start over. @@ -1305,7 +1331,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. 
if (BB.getTerminator()->getNumSuccessors() == 0) - MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB); + MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB, ThrowableInst); return MadeChange; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index 934853507478..132dfc8f6da1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index ce540683dae2..40c1ba88354f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/AtomicOrdering.h" @@ -55,6 +55,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> #include <memory> @@ -906,8 +907,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + + salvageDebugInfoOrMarkUndef(*Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e6abf1ceb026..72512430b366 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -13,6 +13,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp index 4d2eac0451df..af223cc837f2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "float2int" #include "llvm/Transforms/Scalar/Float2Int.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp index 743353eaea22..1e6aab14e7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp @@ -64,6 +64,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -112,7 +113,7 @@ static cl::opt<uint32_t> MaxNumDeps( struct llvm::GVN::Expression { uint32_t opcode; - Type *type; + Type *type = nullptr; bool commutative = false; SmallVector<uint32_t, 4> varargs; @@ -173,7 +174,7 @@ struct llvm::gvn::AvailableValue { PointerIntPair<Value *, 2, ValType> Val; /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; + unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; @@ -237,7 +238,7 @@ struct llvm::gvn::AvailableValue { /// the associated BasicBlock. struct llvm::gvn::AvailableValueInBlock { /// BB - The basic block in question. - BasicBlock *BB; + BasicBlock *BB = nullptr; /// AV - The actual available value AvailableValue AV; @@ -364,6 +365,7 @@ GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; +GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { @@ -1387,6 +1389,59 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ) + return true; + + // Floating point comparisons can be equal, but not equivalent. Cases: + // NaNs for unordered operators + // +0.0 vs -0.0 for all operators + if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + Cmp->getFastMathFlags().noNaNs())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true; + // TODO: Handle vector floating point constants + } + return false; +} + +static bool impliesEquivalanceIfFalse(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE) + return true; + + // Floating point comparisons can be equal, but not equivalent. Cases: + // NaNs for unordered operators + // +0.0 vs -0.0 for all operators + if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE && + Cmp->getFastMathFlags().noNaNs()) || + Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used.
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true; + // TODO: Handle vector floating point constants + } + return false; +} + + static bool hasUsersIn(Value *V, BasicBlock *BB) { for (User *U : V->users()) if (isa<Instruction>(U) && @@ -1451,10 +1506,7 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // call void @llvm.assume(i1 %cmp) // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || - CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || - (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && - CmpI->getFastMathFlags().noNaNs())) { + if (impliesEquivalanceIfTrue(CmpI)) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); // Heuristically pick the better replacement -- the choice of heuristic @@ -1481,12 +1533,6 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) return Changed; - // +0.0 and -0.0 compare equal, but do not imply equivalence. Unless we - // can prove equivalence, bail. - if (CmpRHS->getType()->isFloatTy() && - (!isa<ConstantFP>(CmpRHS) || cast<ConstantFP>(CmpRHS)->isZero())) - return Changed; - LLVM_DEBUG(dbgs() << "Replacing dominated uses of " << *CmpLHS << " with " << *CmpRHS << " in block " @@ -1875,27 +1921,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + // A with B everywhere in the scope. For floating point operations, we + // have to be careful since equality does not always imply equivalence. + if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) || + (isKnownFalse && impliesEquivalanceIfFalse(Cmp))) Worklist.push_back(std::make_pair(Op0, Op1)); - // Handle the floating point versions of equality comparisons too. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { - - // Floating point -0.0 and 0.0 compare equal, so we can only - // propagate values if we know that we have a constant and that - // its value is non-zero. - - // FIXME: We should do this optimization if 'no signed zeros' is - // applicable via an instruction-level fast-math-flag or some other - // indicator that relaxed FP semantics are being used. - - if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) - Worklist.push_back(std::make_pair(Op0, Op1)); - } - // If "A >= B" is known true, replace "A < B" with false everywhere.
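The two impliesEquivalance* helpers rest on the distinction between floating point values that compare equal and values that are actually interchangeable. A self-contained demonstration, independent of LLVM:

#include <cmath>
#include <cstdio>

int main() {
  double pz = +0.0, nz = -0.0;
  std::printf("%d\n", pz == nz);              // 1: +0.0 and -0.0 compare equal...
  std::printf("%g %g\n", 1.0 / pz, 1.0 / nz); // inf -inf: ...but are not equivalent
  double n = std::nan("");
  std::printf("%d\n", n != n);                // 1: NaN makes unordered != true
  return 0;
}

Hence GVN may only propagate an FP equality when a non-zero constant operand rules out the signed-zero case, and NaNs are excluded either by the predicate or by the nnan fast-math flag.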
CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp index c87e41484b13..e1796f6bf05a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -47,7 +47,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -65,6 +64,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -72,6 +72,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <iterator> @@ -956,7 +957,8 @@ private: if (MoveAccess && NewMemAcc) { // The definition of this ld/st will not change: ld/st hoisting is // legal when the ld/st is not moved past its current definition. - MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, + MemorySSA::BeforeTerminator); } // Replace all other instructions with Repl with memory access NewMemAcc. @@ -1067,6 +1069,9 @@ private: ++NI; } + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + NumHoisted += NL + NS + NC + NI; NumRemoved += NR; NumLoadsHoisted += NL; @@ -1168,6 +1173,7 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp index 054025755c69..6d0a4975e266 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -47,7 +47,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -59,6 +58,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -71,6 +71,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 2697d7809568..a3eba27a4d90 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -39,7 +39,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" -#include <functional> #include 
"llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" @@ -53,11 +52,15 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <functional> using namespace llvm; @@ -66,22 +69,6 @@ using namespace llvm; STATISTIC(GuardsEliminated, "Number of eliminated guards"); STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches"); -static cl::opt<bool> WidenFrequentBranches( - "guard-widening-widen-frequent-branches", cl::Hidden, - cl::desc("Widen conditions of explicit branches into dominating guards in " - "case if their taken frequency exceeds threshold set by " - "guard-widening-frequent-branch-threshold option"), - cl::init(false)); - -static cl::opt<unsigned> FrequentBranchThreshold( - "guard-widening-frequent-branch-threshold", cl::Hidden, - cl::desc("When WidenFrequentBranches is set to true, this option is used " - "to determine which branches are frequently taken. The criteria " - "that a branch is taken more often than " - "((FrequentBranchThreshold - 1) / FrequentBranchThreshold), then " - "it is considered frequently taken"), - cl::init(1000)); - static cl::opt<bool> WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden, cl::desc("Whether or not we should widen guards " @@ -97,15 +84,16 @@ static Value *getCondition(Instruction *I) { "Bad guard intrinsic?"); return GI->getArgOperand(0); } - if (isGuardAsWidenableBranch(I)) { - auto *Cond = cast<BranchInst>(I)->getCondition(); - return cast<BinaryOperator>(Cond)->getOperand(0); - } + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB)) + return Cond; + return cast<BranchInst>(I)->getCondition(); } // Set the condition for \p I to \p NewCond. \p I can either be a guard or a -// conditional branch. +// conditional branch. static void setCondition(Instruction *I, Value *NewCond) { if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) { assert(GI->getIntrinsicID() == Intrinsic::experimental_guard && @@ -126,7 +114,6 @@ class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; LoopInfo &LI; - BranchProbabilityInfo *BPI; /// Together, these describe the region of interest. This might be all of /// the blocks within a function, or only a given loop's blocks and preheader. 
@@ -271,26 +258,22 @@ class GuardWideningImpl { void widenGuard(Instruction *ToWiden, Value *NewCondition, bool InvertCondition) { Value *Result; + widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result, InvertCondition); - Value *WidenableCondition = nullptr; if (isGuardAsWidenableBranch(ToWiden)) { - auto *Cond = cast<BranchInst>(ToWiden)->getCondition(); - WidenableCondition = cast<BinaryOperator>(Cond)->getOperand(1); + setWidenableBranchCond(cast<BranchInst>(ToWiden), Result); + return; } - if (WidenableCondition) - Result = BinaryOperator::CreateAnd(Result, WidenableCondition, - "guard.chk", ToWiden); setCondition(ToWiden, Result); } public: explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, - LoopInfo &LI, BranchProbabilityInfo *BPI, - DomTreeNode *Root, + LoopInfo &LI, DomTreeNode *Root, std::function<bool(BasicBlock*)> BlockFilter) - : DT(DT), PDT(PDT), LI(LI), BPI(BPI), Root(Root), BlockFilter(BlockFilter) + : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {} /// The entry point for this pass. @@ -309,13 +292,6 @@ static bool isSupportedGuardInstruction(const Instruction *Insn) { bool GuardWideningImpl::run() { DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock; bool Changed = false; - Optional<BranchProbability> LikelyTaken = None; - if (WidenFrequentBranches && BPI) { - unsigned Threshold = FrequentBranchThreshold; - assert(Threshold > 0 && "Zero threshold makes no sense!"); - LikelyTaken = BranchProbability(Threshold - 1, Threshold); - } - for (auto DFI = df_begin(Root), DFE = df_end(Root); DFI != DFE; ++DFI) { auto *BB = (*DFI)->getBlock(); @@ -330,17 +306,6 @@ bool GuardWideningImpl::run() { for (auto *II : CurrentList) Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock); - if (WidenFrequentBranches && BPI) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional()) { - // If one of branches of a conditional is likely taken, try to - // eliminate it. 
- if (BPI->getEdgeProbability(BB, 0U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock); - else if (BPI->getEdgeProbability(BB, 1U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock, - /*InvertCondition*/true); - } } assert(EliminatedGuardsAndBranches.empty() || Changed); @@ -805,10 +770,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = AM.getCachedResult<BranchProbabilityAnalysis>(F); - if (!GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run()) return PreservedAnalyses::all(); @@ -820,22 +782,13 @@ PreservedAnalyses GuardWideningPass::run(Function &F, PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function &F = *L.getHeader()->getParent(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(F); - BasicBlock *RootBB = L.getLoopPredecessor(); if (!RootBB) RootBB = L.getHeader(); auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L.contains(BB); }; - if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, BPI, - AR.DT.getNode(RootBB), + if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB), BlockFilter).run()) return PreservedAnalyses::all(); @@ -856,10 +809,7 @@ struct GuardWideningLegacyPass : public FunctionPass { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run(); } @@ -868,8 +818,6 @@ struct GuardWideningLegacyPass : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); } }; @@ -895,16 +843,11 @@ struct LoopGuardWideningLegacyPass : public LoopPass { auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L->contains(BB); }; - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, PDT, LI, BPI, + return GuardWideningImpl(DT, PDT, LI, DT.getNode(RootBB), BlockFilter).run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); AU.addPreserved<PostDominatorTreeWrapperPass>(); @@ -920,8 +863,6 @@ INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - 
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) @@ -931,8 +872,6 @@ INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening", "Widen guards (within a single loop, as a loop pass)", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 5519a00c12c9..d8d7acae5c9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,8 +31,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -44,7 +44,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" @@ -68,6 +67,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -79,6 +79,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <cassert> @@ -125,10 +126,9 @@ DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); static cl::opt<bool> -LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(false), +LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), cl::desc("Predicate conditions in read only loops")); - namespace { struct RewritePhi; @@ -2663,11 +2663,11 @@ static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, // merge the max and exact information to approximate a version of // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector<const SCEV*, 4> ExitCounts; - const SCEV *MaxConstEC = SE.getConstantMaxBackedgeTakenCount(L); - if (!isa<SCEVCouldNotCompute>(MaxConstEC)) - ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + ExitCount = SE.getExitCount(L, ExitingBB, + ScalarEvolution::ConstantMaximum); if (!isa<SCEVCouldNotCompute>(ExitCount)) { assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " @@ -2835,6 +2835,10 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { !isSafeToExpand(ExactBTC, *SE)) return Changed; + // If we end up with a pointer exit count, bail. It may be unsized. 
+ if (!ExactBTC->getType()->isIntegerTy()) + return Changed; + auto BadExit = [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the // innermost one. Otherwise, we're changing how many times the innermost @@ -2865,6 +2869,10 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { !isSafeToExpand(ExitCount, *SE)) return true; + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExitCount->getType()->isIntegerTy()) + return true; + return false; }; @@ -2936,7 +2944,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { // varying check. Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); IRBuilder<> B(L->getLoopPreheader()->getTerminator()); - Value *ExactBTCV = nullptr; //lazy generated if needed + Value *ExactBTCV = nullptr; // Lazily generated if needed. for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); @@ -2991,15 +2999,15 @@ bool IndVarSimplify::run(Loop *L) { if (!L->isLoopSimplifyForm()) return false; - // If there are any floating-point recurrences, attempt to - // transform them to use integer recurrences. - Changed |= rewriteNonIntegerIVs(L); - #ifndef NDEBUG // Used below for a consistency check only const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); #endif + // If there are any floating-point recurrences, attempt to + // transform them to use integer recurrences. + Changed |= rewriteNonIntegerIVs(L); + // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG @@ -3026,11 +3034,19 @@ bool IndVarSimplify::run(Loop *L) { NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); // Try to eliminate loop exits based on analyzeable exit counts - Changed |= optimizeLoopExits(L, Rewriter); + if (optimizeLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // Try to form loop invariant tests for loop exits by changing how many // iterations of the loop run when that is unobservable. - Changed |= predicateLoopExits(L, Rewriter); + if (predicateLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -3118,7 +3134,8 @@ bool IndVarSimplify::run(Loop *L) { "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's - // ability to compute trip count. + // ability to compute trip count. We may have *changed* the exit count, but + // only by reducing it. 
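For orientation, a rough source-level analogy of the exit-count rewrites these two calls guard; the pass itself works on IR and only fires when the extra iterations are unobservable in a read-only loop, and all names here are illustrative:

// Before: a loop-varying exit test evaluated every iteration.
int sumBefore(const int *a, int n, int m) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    if (i > m)
      break;
    s += a[i];
  }
  return s;
}

// After: one invariant bound computed at loop entry (assumes m < INT_MAX).
int sumAfter(const int *a, int n, int m) {
  int s = 0;
  int bound = (m + 1 < n) ? m + 1 : n;
  for (int i = 0; i < bound; ++i)
    s += a[i];
  return s;
}

Because such rewrites can shrink the computed exit counts, the pass now calls SE->forgetLoop after either transformation, and the trailing assertion only rejects an increased backedge-taken count.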
#ifndef NDEBUG if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); @@ -3130,7 +3147,8 @@ bool IndVarSimplify::run(Loop *L) { else BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, NewBECount->getType()); - assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount, + NewBECount) && "indvars must preserve SCEV"); } #endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 997d68838152..58469749600e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -74,6 +74,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index e7e73a132fbe..dfb1b6bfb739 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,11 +141,11 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { - const TargetTransformInfo *TTI; + const TargetTransformInfo *TTI = nullptr; /// Target specific address space which uses of should be replaced if /// possible. - unsigned FlatAddrSpace; + unsigned FlatAddrSpace = 0; public: static char ID; @@ -791,8 +791,8 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast<MemSetInst>(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), - MSI->getLength(), MSI->getDestAlignment(), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), + MaybeAlign(MSI->getDestAlignment()), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { @@ -808,15 +808,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, if (isa<MemCpyInst>(MTI)) { MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); - B.CreateMemCpy(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, TBAAStruct, ScopeMD, NoAliasMD); } else { assert(isa<MemMoveInst>(MTI)); - B.CreateMemMove(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index ec28f790f252..e8bbf2936da6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" #include 
"llvm/Transforms/Utils/Local.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 0cf00baaa24a..98c2fcb3dae0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -305,14 +306,13 @@ bool JumpThreading::runOnFunction(Function &F) { DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - bool HasProfileData = F.hasProfileData(); - if (HasProfileData) { + if (F.hasProfileData()) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, HasProfileData, + bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; @@ -339,7 +339,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, HasProfileData, + bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (!Changed) @@ -1002,49 +1002,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. - if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && - SinglePred != BB && !hasAddressTakenAndUsed(BB)) { - // If SinglePred was a loop header, BB becomes one. - if (LoopHeaders.erase(SinglePred)) - LoopHeaders.insert(BB); - - LVI->eraseBlock(SinglePred); - MergeBasicBlockIntoOnlyPred(BB, DTU); - - // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by - // BB code within one basic block `BB`), we need to invalidate the LVI - // information associated with BB, because the LVI information need not be - // true for all of BB after the merge. For example, - // Before the merge, LVI info and code is as follows: - // SinglePred: <LVI info1 for %p val> - // %y = use of %p - // call @exit() // need not transfer execution to successor. - // assume(%p) // from this point on %p is true - // br label %BB - // BB: <LVI info2 for %p val, i.e. %p is true> - // %x = use of %p - // br label exit - // - // Note that this LVI info for blocks BB and SinglPred is correct for %p - // (info2 and info1 respectively). After the merge and the deletion of the - // LVI info1 for SinglePred. We have the following code: - // BB: <LVI info2 for %p val> - // %y = use of %p - // call @exit() - // assume(%p) - // %x = use of %p <-- LVI info2 is correct from here onwards. - // br label exit - // LVI info2 for BB is incorrect at the beginning of BB. - - // Invalidate LVI information for BB if the LVI is not provably true for - // all of BB. 
- if (!isGuaranteedToTransferExecutionToSuccessor(BB))
- LVI->eraseBlock(BB);
- return true;
- }
- }
+ if (MaybeMergeBasicBlockIntoOnlyPred(BB))
+ return true;
 if (TryToUnfoldSelectInCurrBB(BB))
 return true;
@@ -1758,7 +1717,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
 getSuccessor(GetBestDestForJumpOnUndef(BB));
 // Ok, try to thread it!
- return ThreadEdge(BB, PredsToFactor, MostPopularDest);
+ return TryThreadEdge(BB, PredsToFactor, MostPopularDest);
 }
 /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on
@@ -1920,12 +1879,146 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
 }
 }
-/// ThreadEdge - We have decided that it is safe and profitable to factor the
-/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
-/// across BB. Transform the IR to reflect this change.
-bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &PredBBs,
- BasicBlock *SuccBB) {
+/// Merge basic block BB into its sole predecessor if possible.
+bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+ if (!SinglePred)
+ return false;
+
+ const Instruction *TI = SinglePred->getTerminator();
+ if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
+ SinglePred == BB || hasAddressTakenAndUsed(BB))
+ return false;
+
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ LVI->eraseBlock(SinglePred);
+ MergeBasicBlockIntoOnlyPred(BB, DTU);
+
+ // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+ // Note that this LVI info for blocks BB and SinglePred is correct for %p
+ // (info2 and info1 respectively). After the merge and the deletion of the
+ // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
+ LVI->eraseBlock(BB);
+ return true;
+}
+
+/// Update the SSA form. NewBB contains instructions that are copied from BB.
+/// ValueMapping maps old values in BB to new ones in NewBB.
+void JumpThreadingPass::UpdateSSA(
+ BasicBlock *BB, BasicBlock *NewBB,
+ DenseMap<Instruction *, Value *> &ValueMapping) {
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+ // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate; + SmallVector<Use *, 16> UsesToRename; + + for (Instruction &I : *BB) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Use &U : I.uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + LLVM_DEBUG(dbgs() << "\n"); + } +} + +/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone +/// arguments that come from PredBB. Return the map from the variables in the +/// source basic block to the variables in the newly created basic block. +DenseMap<Instruction *, Value *> +JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI, + BasicBlock::iterator BE, BasicBlock *NewBB, + BasicBlock *PredBB) { + // We are going to have to map operands from the source basic block to the new + // copy of the block 'NewBB'. If there are PHI nodes in the source basic + // block, evaluate them to account for entry from PredBB. + DenseMap<Instruction *, Value *> ValueMapping; + + // Clone the phi nodes of the source basic block into NewBB. The resulting + // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater + // might need to rewrite the operand of the cloned phi. + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); + NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); + ValueMapping[PN] = NewPN; + } + + // Clone the non-phi instructions of the source basic block into NewBB, + // keeping track of the mapping and using it to remap operands in the cloned + // instructions. + for (; BI != BE; ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[&*BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + return ValueMapping; +} + +/// TryThreadEdge - Thread an edge if it's safe and profitable to do so. +bool JumpThreadingPass::TryThreadEdge( + BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. 
if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -1955,6 +2048,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, return false; } + ThreadEdge(BB, PredBBs, SuccBB); + return true; +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. +void JumpThreadingPass::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { + assert(SuccBB != BB && "Don't create an infinite loop"); + + assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && + "Don't thread across loop headers"); + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1968,7 +2076,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // And finally, do it! LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() - << "' with cost: " << JumpThreadCost << ", across block:\n " << *BB << "\n"); if (DTU->hasPendingDomTreeUpdates()) @@ -1977,11 +2084,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, LVI->enableDT(); LVI->threadEdge(PredBB, BB, SuccBB); - // We are going to have to map operands from the original BB block to the new - // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to - // account for entry from PredBB. - DenseMap<Instruction*, Value*> ValueMapping; - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+".thread", BB->getParent(), BB); @@ -1994,32 +2096,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } - BasicBlock::iterator BI = BB->begin(); - // Clone the phi nodes of BB into NewBB. The resulting phi nodes are trivial, - // since NewBB only has one predecessor, but SSAUpdater might need to rewrite - // the operand of the cloned phi. - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { - PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); - NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); - ValueMapping[PN] = NewPN; - } - - // Clone the non-phi instructions of BB into NewBB, keeping track of the - // mapping and using it to remap operands in the cloned instructions. - for (; !BI->isTerminator(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - NewBB->getInstList().push_back(New); - ValueMapping[&*BI] = New; - - // Remap operands to patch up intra-block references. - for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) - if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); - if (I != ValueMapping.end()) - New->setOperand(i, I->second); - } - } + // Copy all the instructions from BB to NewBB except the terminator. + DenseMap<Instruction *, Value *> ValueMapping = + CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. 
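The shape of this refactor recurs across LLVM: the fallible legality and profitability checks stay in TryThreadEdge, which reports whether anything changed, while ThreadEdge becomes an unconditional worker that asserts the same preconditions. A minimal self-contained sketch of the split, with hypothetical names and a trivial stand-in for the actual CFG surgery:

#include <cassert>

struct Edge {
  int Src = 0, Dst = 0;
  bool Threaded = false;
};

// Stand-in legality test; the real pass also rejects loop headers and
// blocks that are too expensive to duplicate.
static bool isLegalToThread(const Edge &E) { return E.Src != E.Dst; }

// Worker: assumes legality, so it can return void and assert.
static void threadEdge(Edge &E) {
  assert(isLegalToThread(E) && "caller must have checked legality");
  E.Threaded = true; // stand-in for the actual transformation
}

// Wrapper: answers "did anything change?" for the caller's fixpoint loop.
static bool tryThreadEdge(Edge &E) {
  if (!isLegalToThread(E))
    return false;
  threadEdge(E);
  return true;
}

Callers that have already established legality by other means can then invoke the worker directly.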
@@ -2045,44 +2124,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, {DominatorTree::Insert, PredBB, NewBB}, {DominatorTree::Delete, PredBB, BB}}); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if their uses are no longer - // dominated by the previous def and if so, record them in UsesToRename. - // Also, skip phi operands from PredBB - we'll remove them anyway. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. - SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, NewBB, ValueMapping); // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This @@ -2094,7 +2136,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // Threaded an edge! ++NumThreads; - return true; } /// Create a new basic block that will be the predecessor of BB and successor of @@ -2366,43 +2407,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if it is used outside of its - // block, and if so, record them in UsesToRename. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. 
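Both ThreadEdge and DuplicateCondBranchOnPHIIntoPred now funnel this renaming through UpdateSSA. Its core is the standard SSAUpdater recipe; a condensed sketch (it omits the PHI incoming-block special case the real code handles), assuming a value I defined in BB with a clone in NewBB:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

static void rewriteNonLocalUses(Instruction &I, BasicBlock *BB,
                                BasicBlock *NewBB, Value *Clone) {
  // Collect the uses that live outside the two blocks that define I.
  SmallVector<Use *, 16> UsesToRename;
  for (Use &U : I.uses()) {
    auto *UserI = cast<Instruction>(U.getUser());
    if (UserI->getParent() != BB && UserI->getParent() != NewBB)
      UsesToRename.push_back(&U);
  }
  if (UsesToRename.empty())
    return;
  // SSAUpdater materializes PHI nodes on demand so every rewritten use
  // sees whichever of the two definitions actually reaches it.
  SSAUpdater Updater;
  Updater.Initialize(I.getType(), I.getName());
  Updater.AddAvailableValue(BB, &I);
  Updater.AddAvailableValue(NewBB, Clone);
  while (!UsesToRename.empty())
    Updater.RewriteUse(*UsesToRename.pop_back_val());
}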
- SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, PredBB, ValueMapping); // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index 6ce4831a7359..8c33045c2380 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -137,7 +138,8 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo *TTI, bool &FreeInLoop); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); @@ -162,7 +164,7 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE); namespace { struct LoopInvariantCodeMotion { @@ -390,8 +392,9 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -787,6 +790,41 @@ public: }; } // namespace + +/// Return true if we know how to rewrite all uses of the given alloca after +/// hoisting it out of the loop. The main concerns are a) potential captures +/// and b) invariant.start markers which don't capture, but are no longer +/// valid w/o a corresponding invariant.end. +static bool canRewriteUsesOfAlloca(AllocaInst &AI) { + // TODO: This looks a lot like capture tracking, but we need to remove any + // invariant starts if we extend the lifetime of the alloca by hoisting it. + // We should probably refactor capture tracking into a form which allows us + // to reuse the relevant bits and remove the duplicated logic here. 
+ + SmallVector<Use *, 16> Worklist; + for (Use &U : AI.uses()) + Worklist.push_back(&U); + + unsigned NumUsesExplored = 0; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + NumUsesExplored++; + if (NumUsesExplored > DefaultMaxUsesToExplore) + return false; + // Non capturing, terminating uses + if (isa<LoadInst>(I) || + (isa<StoreInst>(I) && U->getOperandNo() == 1)) + continue; + // Non capturing, non-terminating + if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I)) + return false; + for (Use &U : I->uses()) + Worklist.push_back(&U); + } + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -795,7 +833,7 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { // Verify inputs. @@ -855,7 +893,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -882,7 +920,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, ORE); + SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; @@ -901,7 +939,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); + HoistedInstructions.push_back(&I); + Changed = true; + continue; + } + + if (isa<AllocaInst>(&I) && + SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + canRewriteUsesOfAlloca(cast<AllocaInst>(I))) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -915,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, PN->setIncomingBlock( i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); Changed = true; continue; @@ -952,7 +1000,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, LLVM_DEBUG(dbgs() << "LICM rehoisting to " << HoistPoint->getParent()->getName() << ": " << *I << "\n"); - moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU); + moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE); HoistPoint = I; Changed = true; } @@ -1142,6 +1190,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Assumes don't actually alias anything or throw return true; + if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) + // Widenable conditions 
don't actually alias anything or throw + return true; + // Handle simple cases by querying alias analysis. FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == FMRB_DoesNotAccessMemory) @@ -1441,14 +1493,18 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, + ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); if (MSSAU) if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::End); + MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), + MemorySSA::BeforeTerminator); + if (SE) + SE->forgetValue(&I); } static Instruction *sinkThroughTriviallyReplaceablePHI( @@ -1662,7 +1718,8 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I << "\n"); ORE->emit([&]() { @@ -1683,10 +1740,10 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. - moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE); else // Move the new node to the destination block, before its terminator. - moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE); // Apply line 0 debug locations when we are moving instructions to different // basic blocks because we want to avoid jumpy line tables. 
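Two invalidations now ride along with every LICM move: MemorySSA relocates the instruction's memory access (to the new BeforeTerminator insertion point rather than the block's end), and ScalarEvolution forgets the moved value, since a cached SCEV can encode facts tied to the old program point. A condensed sketch of those moving parts, restating the same LLVM calls the pass uses rather than adding new behavior:

#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static void moveAndInvalidate(Instruction &I, Instruction &Dest,
                              MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
  I.moveBefore(&Dest);
  // Keep MemorySSA's view of I in sync with its new block; BeforeTerminator
  // is the more precise insertion point this import switches to.
  if (MSSAU)
    if (auto *Acc = cast_or_null<MemoryUseOrDef>(
            MSSAU->getMemorySSA()->getMemoryAccess(&I)))
      MSSAU->moveToPlace(Acc, Dest.getParent(), MemorySSA::BeforeTerminator);
  // Any cached SCEV for I may describe its old position; drop it.
  if (SE)
    SE->forgetValue(&I);
}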
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index a972d6fa2fcd..ab65f56d088f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/InitializePasses.h" #define DEBUG_TYPE "loop-data-prefetch" #include "llvm/ADT/DepthFirstIterator.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index cee197cf8354..2451572d6171 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index f45e5fd0f50b..8e04e6e0ffe8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 9f93c68e6128..e1738f08eb23 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -55,12 +55,15 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeMoverUtils.h" using namespace llvm; @@ -88,6 +91,7 @@ STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); STATISTIC(NonIdenticalGuards, "Candidates have different guards"); STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); +STATISTIC(NotRotated, "Candidate is not rotated"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -163,14 +167,8 @@ struct FusionCandidate { const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr), - DT(DT), PDT(PDT), ORE(ORE) { - - // TODO: This is temporary while we fuse both rotated and non-rotated - // loops. 
Once we switch to only fusing rotated loops, the initialization of
- // GuardBranch can be moved into the initialization list above.
- if (isRotated())
- GuardBranch = L->getLoopGuardBranch();
+ Latch(L->getLoopLatch()), L(L), Valid(true),
+ GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) {
 // Walk over all blocks in the loop and check for conditions that may
 // prevent fusion. For each block, walk over all instructions and collect
@@ -257,15 +255,14 @@ struct FusionCandidate {
 : GuardBranch->getSuccessor(0);
 }
- bool isRotated() const {
- assert(L && "Expecting loop to be valid.");
- assert(Latch && "Expecting latch to be valid.");
- return L->isLoopExiting(Latch);
- }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void dump() const {
- dbgs() << "\tGuardBranch: "
+ dbgs() << "\tGuardBranch: ";
+ if (GuardBranch)
+ dbgs() << *GuardBranch;
+ else
+ dbgs() << "nullptr";
+ dbgs() << "\n"
 << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
 << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
 << "\n"
@@ -316,6 +313,11 @@ struct FusionCandidate {
 return reportInvalidCandidate(NotSimplifiedForm);
 }
+ if (!L->isRotatedForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
+ return reportInvalidCandidate(NotRotated);
+ }
+
 return true;
 }
@@ -591,16 +593,8 @@ private:
 const FusionCandidate &FC1) const {
 assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
- BasicBlock *FC0EntryBlock = FC0.getEntryBlock();
- BasicBlock *FC1EntryBlock = FC1.getEntryBlock();
-
- if (DT.dominates(FC0EntryBlock, FC1EntryBlock))
- return PDT.dominates(FC1EntryBlock, FC0EntryBlock);
-
- if (DT.dominates(FC1EntryBlock, FC0EntryBlock))
- return PDT.dominates(FC0EntryBlock, FC1EntryBlock);
-
- return false;
+ return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
+ DT, PDT);
 }
 /// Iterate over all loops in the given loop set and identify the loops that
@@ -1113,6 +1107,29 @@ private:
 return FC.ExitBlock->size() == 1;
 }
+ /// Simplify the condition of the latch branch of \p FC to true, when both of
+ /// its successors are the same.
+ void simplifyLatchBranch(const FusionCandidate &FC) const {
+ BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
+ if (FCLatchBranch) {
+ assert(FCLatchBranch->isConditional() &&
+ FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
+ "Expecting the two successors of FCLatchBranch to be the same");
+ FCLatchBranch->setCondition(
+ llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType()));
+ }
+ }
+
+ /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
+ /// successor, then merge FC0.Latch with its unique successor.
+ void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
+ moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
+ if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
+ MergeBlockIntoPredecessor(Succ, &DTU, &LI);
+ DTU.flush();
+ }
+ }
+
 /// Fuse two fusion candidates, creating a new fused loop.
 ///
 /// This method contains the mechanics of fusing two loops, represented by \p
@@ -1246,6 +1263,10 @@ private:
 FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
 FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+ // Change the condition of FC0 latch branch to true, as both successors of
+ // the branch are the same.
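The dominance check that ::isControlFlowEquivalent (from the newly included CodeMoverUtils.h) replaces above is worth restating: two blocks are control-flow equivalent when each executes if and only if the other does, which reduces to one block dominating the other while being post-dominated by it. A hypothetical standalone version of the replaced test:

#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

// A and B are control-flow equivalent when whenever one runs, the other
// must run as well: one dominates the other and is post-dominated by it.
static bool controlFlowEquivalent(BasicBlock *A, BasicBlock *B,
                                  const DominatorTree &DT,
                                  const PostDominatorTree &PDT) {
  if (DT.dominates(A, B))
    return PDT.dominates(B, A);
  if (DT.dominates(B, A))
    return PDT.dominates(A, B);
  return false;
}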
+ simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1268,9 +1289,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); @@ -1490,6 +1517,10 @@ private: FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1521,9 +1552,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index dd477e800693..b77843d7cd71 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -41,7 +41,6 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -78,20 +77,17 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" -#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -107,7 +103,6 @@ using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); -STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare"); static cl::opt<bool> UseLIRCodeSizeHeurs( "use-lir-code-size-heurs", @@ -117,26 +112,6 @@ static cl::opt<bool> UseLIRCodeSizeHeurs( namespace { -// FIXME: reinventing the wheel much? 
Is there a cleaner solution? -struct PMAbstraction { - virtual void markLoopAsDeleted(Loop *L) = 0; - virtual ~PMAbstraction() = default; -}; -struct LegacyPMAbstraction : PMAbstraction { - LPPassManager &LPM; - LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {} - virtual ~LegacyPMAbstraction() = default; - void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); } -}; -struct NewPMAbstraction : PMAbstraction { - LPMUpdater &Updater; - NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {} - virtual ~NewPMAbstraction() = default; - void markLoopAsDeleted(Loop *L) override { - Updater.markLoopAsDeleted(*L, L->getName()); - } -}; - class LoopIdiomRecognize { Loop *CurLoop = nullptr; AliasAnalysis *AA; @@ -146,7 +121,6 @@ class LoopIdiomRecognize { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; const DataLayout *DL; - PMAbstraction &LoopDeleter; OptimizationRemarkEmitter &ORE; bool ApplyCodeSizeHeuristics; @@ -155,10 +129,9 @@ public: LoopInfo *LI, ScalarEvolution *SE, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, - const DataLayout *DL, PMAbstraction &LoopDeleter, + const DataLayout *DL, OptimizationRemarkEmitter &ORE) - : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), - LoopDeleter(LoopDeleter), ORE(ORE) {} + : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {} bool runOnLoop(Loop *L); @@ -172,8 +145,6 @@ private: bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; - bool HasMemCmp; - bool HasBCmp; /// Return code for isLegalStore() enum LegalStoreKind { @@ -201,7 +172,7 @@ private: bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, + MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, @@ -216,32 +187,6 @@ private: bool runOnNoncountableLoop(); - struct CmpLoopStructure { - Value *BCmpValue, *LatchCmpValue; - BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB; - BasicBlock *LatchBrFinishBB, *LatchBrContinueBB; - }; - bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const; - struct CmpOfLoads { - ICmpInst::Predicate BCmpPred; - Value *LoadSrcA, *LoadSrcB; - Value *LoadA, *LoadB; - }; - bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const; - bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads, - CmpLoopStructure &CmpLoop) const; - bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads, - const SCEV *&SrcA, const SCEV *&SrcB, - const SCEV *&Iterations) const; - bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst, - LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA, - const SCEV *&SrcB, const SCEV *&NBytes) const; - BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual); - void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst, - LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA, - const SCEV *SrcB, const SCEV *NBytes); - bool recognizeBCmp(); - bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); @@ -279,14 +224,13 @@ public: &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()); const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout(); - LegacyPMAbstraction LoopDeleter(LPM); // For the old PM, we can't use OptimizationRemarkEmitter as an analysis // pass. 
Function analyses need to be preserved across loop transformations // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L->getHeader()->getParent()); - LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE); + LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE); return LIR.runOnLoop(L); } @@ -305,7 +249,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0; PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &Updater) { + LPMUpdater &) { const auto *DL = &L.getHeader()->getModule()->getDataLayout(); const auto &FAM = @@ -319,9 +263,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM, "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached " "at a higher level"); - NewPMAbstraction LoopDeleter(Updater); LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL, - LoopDeleter, *ORE); + *ORE); if (!LIR.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -358,8 +301,7 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { // Disable loop idiom recognition if the function's name is a common idiom. StringRef Name = L->getHeader()->getParent()->getName(); - if (Name == "memset" || Name == "memcpy" || Name == "memcmp" || - Name == "bcmp") + if (Name == "memset" || Name == "memcpy") return false; // Determine if code size heuristics need to be applied. @@ -369,10 +311,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { HasMemset = TLI->has(LibFunc_memset); HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); HasMemcpy = TLI->has(LibFunc_memcpy); - HasMemCmp = TLI->has(LibFunc_memcmp); - HasBCmp = TLI->has(LibFunc_bcmp); - if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp) + if (HasMemset || HasMemsetPattern || HasMemcpy) if (SE->hasLoopInvariantBackedgeTakenCount(L)) return runOnCountableLoop(); @@ -791,7 +731,8 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, bool NegStride = StoreSize == -Stride; - if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + if (processLoopStridedStore(StorePtr, StoreSize, + MaybeAlign(HeadStore->getAlignment()), StoredVal, HeadStore, AdjacentStores, StoreEv, BECount, NegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); @@ -846,9 +787,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); bool NegStride = SizeInBytes == -Stride; - return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getDestAlignment(), SplatValue, MSI, MSIs, - Ev, BECount, NegStride, /*IsLoopMemset=*/true); + return processLoopStridedStore( + Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()), + SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -938,7 +879,7 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. 
bool LoopIdiomRecognize::processLoopStridedStore( - Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset) { @@ -960,12 +901,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); - Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); const SCEV *Start = Ev->getStart(); // Handle negative strided loops. if (NegStride) - Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); + Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -993,7 +934,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, everything looks good, insert the memset. const SCEV *NumBytesS = - getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -1001,12 +942,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( return false; Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, + MaybeAlign(StoreAlignment)); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; @@ -1014,7 +955,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr); + Int8PtrTy, Int8PtrTy, IntIdxTy); inferLibFuncAttributes(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -1081,11 +1022,11 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *StrStart = StoreEv->getStart(); unsigned StrAS = SI->getPointerAddressSpace(); - Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); // Handle negative strided loops. if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); + StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1111,7 +1052,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Handle negative strided loops. if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE); // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. @@ -1133,18 +1074,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, everything is safe, we can transform this! 
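The MaybeAlign rewrites threaded through this file (and through InferAddressSpaces earlier) all encode the same convention: the legacy unsigned alignment used 0 for "unknown", and MaybeAlign makes that explicit as an Optional<Align>. A tiny illustration of the semantics, assuming only llvm/Support/Alignment.h:

#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;

int main() {
  MaybeAlign Unknown(0); // legacy "alignment 0" means "not specified"
  MaybeAlign Eight(8);   // a known power-of-two alignment
  assert(!Unknown.hasValue());          // 0 maps to None, not Align(0)
  assert(Eight && Eight->value() == 8); // Optional<Align>-style access
  return 0;
}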
const SCEV *NumBytesS = - getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must necessarily be unordered // by previous checks. if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(), - LoadBasePtr, LI->getAlignment(), NumBytes); + NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr, + LI->getAlign(), NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. @@ -1211,7 +1152,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() { << "] Noncountable Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS(); + return recognizePopcount() || recognizeAndInsertFFS(); } /// Check if the given conditional branch is based on the comparison between @@ -1885,811 +1826,3 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, // loop. The loop would otherwise not be deleted even if it becomes empty. SE->forgetLoop(CurLoop); } - -bool LoopIdiomRecognize::matchBCmpLoopStructure( - CmpLoopStructure &CmpLoop) const { - ICmpInst::Predicate BCmpPred; - - // We are looking for the following basic layout: - // PreheaderBB: <preheader> ; preds = ??? - // <...> - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB - // <...> - // %BCmpValue = icmp <...> - // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0 - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // %LatchCmpValue = <are we done, or do next iteration?> - // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB - // Successor0: <exit> ; preds = %LoopHeaderBB - // <...> - // Successor1: <exit> ; preds = %LoopLatchBB - // <...> - // - // Successor0 and Successor1 may or may not be the same basic block. - - // Match basic frame-work of this supposedly-comparison loop. - using namespace PatternMatch; - if (!match(CurLoop->getHeader()->getTerminator(), - m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()), - m_Value(CmpLoop.BCmpValue)), - CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) || - !match(CurLoop->getLoopLatch()->getTerminator(), - m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)), - CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) { - LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n"); - return true; -} - -bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue, - CmpOfLoads &CmpOfLoads) const { - using namespace PatternMatch; - LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue - << " as bcmp pattern.\n"); - - // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example: - // %v0 = load <...>, <...>* %LoadSrcA - // %v1 = load <...>, <...>* %LoadSrcB - // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1 - // There won't be any no-op bitcasts between load and icmp, - // they would have been transformed into a load of bitcast. - // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too. 
- if (!match(BCmpValue, - m_ICmp(CmpOfLoads.BCmpPred, - m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)), - m_Value(CmpOfLoads.LoadA)), - m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)), - m_Value(CmpOfLoads.LoadB)))) || - !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) { - LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t" - << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB - << "\n"); - // FIXME: handle memcmp pattern? - return true; -} - -bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow( - const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const { - BasicBlock *LoopHeaderBB = CurLoop->getHeader(); - BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); - - // Be wary, comparisons can be inverted, canonicalize order. - // If this 'element' comparison passed, we expect to proceed to the next elt. - if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ) - std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB); - // The predicate on loop latch does not matter, just canonicalize some order. - if (CmpLoop.LatchBrContinueBB != LoopHeaderBB) - std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB); - - SmallVector<BasicBlock *, 2> ExitBlocks; - - CurLoop->getUniqueExitBlocks(ExitBlocks); - assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks."); - - // Check that control-flow between blocks is as expected. - if (CmpLoop.HeaderBrEqualBB != LoopLatchBB || - CmpLoop.LatchBrContinueBB != LoopHeaderBB || - !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) || - !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) { - LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n"); - return false; - } - - assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) && - !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) && - "Unexpected exit edges."); - - LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n"); - - LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n"); - assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here."); - // No loop instructions must be used outside of the loop. Since we are in - // LCSSA form, we only need to check successor block's PHI nodes's incoming - // values for incoming blocks that are the loop basic blocks. - for (const BasicBlock *ExitBB : ExitBlocks) { - for (const PHINode &PHI : ExitBB->phis()) { - for (const BasicBlock *LoopBB : - make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) { - return CurLoop->contains(PredecessorBB); - })) { - const auto *I = - dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB)); - if (I && CurLoop->contains(I)) { - LLVM_DEBUG(dbgs() - << "Loop contains instruction " << *I - << " which is used outside of the loop in basic block " - << ExitBB->getName() << " in phi node " << PHI << "\n"); - return false; - } - } - } - } - // Similarly, the loop should not have any other observable side-effects - // other than the final comparison result. - for (BasicBlock *LoopBB : CurLoop->blocks()) { - for (Instruction &I : *LoopBB) { - if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo. - continue; // FIXME: anything else? lifetime info? 
- if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) && - &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) { - LLVM_DEBUG( - dbgs() << "Loop contains instruction with potential side-effects: " - << I << "\n"); - return false; - } - } - } - LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n"); - return true; -} - -bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, - CmpOfLoads &CmpOfLoads, - const SCEV *&SrcA, - const SCEV *&SrcB, - const SCEV *&Iterations) const { - // Try to compute SCEV of the loads, for this loop's scope. - const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>( - SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop)); - const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>( - SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop)); - if (!ScevForSrcA || !ScevForSrcB) { - LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t" - << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n"); - - // Loads must have folloving SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB> - const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE); - const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE); - if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() || - ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop || - RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) || - cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support " - "affine SCEV expressions originating in the loop we " - "are analysing with identical constant positive step, " - "equal to the count of bytes compared. Got:\n\t" - << *RecStepForA << "\n\t" << *RecStepForB << "\n"); - return false; - // FIXME: can support BCmpTyBytes > Step. - // But will need to account for the extra bytes compared at the end. - } - - SrcA = ScevForSrcA->getStart(); - SrcB = ScevForSrcB->getStart(); - LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA - << "\n\t" << *SrcB << "\n"); - - // The load sources must be loop-invants that dominate the loop header. - if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() || - !SE->isAvailableAtLoopEntry(SrcA, CurLoop) || - !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable " - "prior to loop header.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n"); - - // bcmp / memcmp take length argument as size_t, so let's conservatively - // assume that the iteration count should be not wider than that. - Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext()); - - // For how many iterations is loop guaranteed not to exit via LoopLatch? - // This is one less than the maximal number of comparisons,and is: n + -1 - const SCEV *LoopExitCount = - SE->getExitCount(CurLoop, CurLoop->getLoopLatch()); - LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: " - << *LoopExitCount << "\n"); - // Exit count, similarly, must be loop-invant that dominates the loop header. 
- if (LoopExitCount == SE->getCouldNotCompute() || - !LoopExitCount->getType()->isIntOrPtrTy() || - LoopExitCount->getType()->getScalarSizeInBits() > - CmpFuncSizeTy->getScalarSizeInBits() || - !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) { - LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n"); - return false; - } - - // LoopExitCount is always one less than the actual count of iterations. - // Do this before cast, else we will be stuck with 1 + zext(-1 + n) - Iterations = SE->getAddExpr( - LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW); - assert(Iterations != SE->getCouldNotCompute() && - "Shouldn't fail to increment by one."); - - LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n"); - return true; -} - -/// Return true iff the bcmp idiom is detected in the loop. -/// -/// Additionally: -/// 1) \p BCmpInst is set to the root byte-comparison instruction. -/// 2) \p LatchCmpInst is set to the comparison that controls the latch. -/// 3) \p LoadA is set to the first LoadInst. -/// 4) \p LoadB is set to the second LoadInst. -/// 5) \p SrcA is set to the first source location that is being compared. -/// 6) \p SrcB is set to the second source location that is being compared. -/// 7) \p NBytes is set to the number of bytes to compare. -bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst, - CmpInst *&LatchCmpInst, - LoadInst *&LoadA, LoadInst *&LoadB, - const SCEV *&SrcA, const SCEV *&SrcB, - const SCEV *&NBytes) const { - LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n"); - - // Give up if the loop is not in normal form, or has more than 2 blocks. - if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) { - LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n"); - - CmpLoopStructure CmpLoop; - if (!matchBCmpLoopStructure(CmpLoop)) - return false; - - CmpOfLoads CmpOfLoads; - if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads)) - return false; - - if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop)) - return false; - - BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no - LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine - LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these cast with - LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher? - - Type *BCmpValTy = BCmpInst->getOperand(0)->getType(); - LLVMContext &Context = BCmpValTy->getContext(); - uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy); - static constexpr uint64_t ByteTyBits = 8; - - LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy - << " of size " << BCmpTyBits - << " bits (while byte = " << ByteTyBits << " bits).\n"); - // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check - // that we are dealing with a multiple of a byte here. - if (BCmpTyBits % ByteTyBits != 0) { - LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n"); - return false; - // FIXME: could still be done under a run-time check that the total bit - // count is a multiple of a byte i guess? Or handle remainder separately? - } - - // Each comparison is done on this many bytes. 
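The deleted SCEV bookkeeping follows a recipe that is useful beyond bcmp: a latch exit count is one less than the trip count, the add-one happens before widening to avoid being stuck with 1 + zext(-1 + n), and the byte count is built with explicit no-unsigned-wrap arithmetic. Condensed into a hypothetical helper over the same ScalarEvolution API:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// ExitCount counts iterations that stay in the loop, i.e. trips - 1.
// Add one first, then widen to the size type, then scale to bytes.
static const SCEV *byteCount(ScalarEvolution &SE, const SCEV *ExitCount,
                             Type *SizeTy, uint64_t EltBytes) {
  const SCEV *Trips = SE.getAddExpr(
      ExitCount, SE.getOne(ExitCount->getType()), SCEV::FlagNUW);
  Trips = SE.getNoopOrZeroExtend(Trips, SizeTy);
  return SE.getMulExpr(Trips, SE.getConstant(SizeTy, EltBytes),
                       SCEV::FlagNUW);
}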
- uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits; - LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes - << " bytes, eligible for bcmp conversion.\n"); - - const SCEV *Iterations; - if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations)) - return false; - - // bcmp / memcmp take length argument as size_t, do promotion now. - Type *CmpFuncSizeTy = DL->getIntPtrType(Context); - Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy); - assert(Iterations != SE->getCouldNotCompute() && "Promotion failed."); - // Note that it didn't do ptrtoint cast, we will need to do it manually. - - // We will be comparing *bytes*, not BCmpTy, we need to recalculate size. - // It's a multiplication, and it *could* overflow. But for it to overflow - // we'd want to compare more bytes than could be represented by size_t, But - // allocation functions also take size_t. So how'd you produce such buffer? - // FIXME: we likely need to actually check that we know this won't overflow, - // via llvm::computeOverflowForUnsignedMul(). - NBytes = SE->getMulExpr( - Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW); - assert(NBytes != SE->getCouldNotCompute() && - "Shouldn't fail to increment by one."); - - LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n"); - - if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() || - LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() || - !LoadB->isSimple()) { - StringLiteral L("Unsupported loads in idiom - only support identical, " - "simple loads from address space 0.\n"); - LLVM_DEBUG(dbgs() << L); - ORE.emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads", - BCmpInst->getDebugLoc(), - CurLoop->getHeader()) - << L; - }); - return false; // FIXME: support non-simple loads. - } - - LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n"); - ORE.emit([&]() { - return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom", - CurLoop->getStartLoc(), - CurLoop->getHeader()) - << "Loop recognized as a bcmp idiom"; - }); - - return true; -} - -BasicBlock * -LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) { - LLVM_DEBUG(dbgs() << "Transforming control-flow.\n"); - SmallVector<DominatorTree::UpdateType, 8> DTUpdates; - - BasicBlock *PreheaderBB = CurLoop->getLoopPreheader(); - BasicBlock *HeaderBB = CurLoop->getHeader(); - BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); - SmallString<32> LoopName = CurLoop->getName(); - Function *Func = PreheaderBB->getParent(); - LLVMContext &Context = Func->getContext(); - - // Before doing anything, drop SCEV info. - SE->forgetLoop(CurLoop); - - // Here we start with: (0/6) - // PreheaderBB: <preheader> ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %LoopHeaderBB - // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB - // <...> - // br i1 %<...>, label %LoopLatchBB, label %Successor0BB - // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB - // <...> - // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB - // Successor0BB: <exit> ; preds = %LoopHeaderBB - // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] - // <...> - // Successor1BB: <exit> ; preds = %LoopLatchBB - // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] - // <...> - // - // Successor0 and Successor1 may or may not be the same basic block. 
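For orientation while reading the deleted recognizer and the control-flow rewrite that follows: at the source level it matched byte-wise equality loops and replaced them with a single library call. Roughly, assuming unsigned char elements so the element count equals the byte count:

#include <cstddef>
#include <strings.h> // POSIX bcmp

// The shape the recognizer matched: an early-exit byte-equality loop...
static bool bytesEqualLoop(const unsigned char *A, const unsigned char *B,
                           size_t N) {
  for (size_t I = 0; I != N; ++I)
    if (A[I] != B[I])
      return false; // header exit: "element compared unequal"
  return true;      // latch exit: "all elements compared equal"
}

// ...and the replacement it emitted, with N already scaled to bytes.
static bool bytesEqualBCmp(const unsigned char *A, const unsigned char *B,
                           size_t N) {
  return bcmp(A, B, N) == 0;
}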
-
-  // Decouple the edge between loop preheader basic block and loop header basic
-  // block. Thus the loop has become unreachable.
-  assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
-         PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
-         "Preheader bb must end with an unconditional branch to header bb.");
-  PreheaderBB->getTerminator()->eraseFromParent();
-  DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});
-
-  // Create a new preheader basic block before loop header basic block.
-  auto *PhonyPreheaderBB = BasicBlock::Create(
-      Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
-  // And insert an unconditional branch from phony preheader basic block to
-  // loop header basic block.
-  IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
-  DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
-
-  // Create a *single* new empty block that we will substitute as a
-  // successor basic block for the loop's exits. This one is temporary.
-  // Much like phony preheader basic block, it is not connected.
-  auto *PhonySuccessorBB =
-      BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
-                         LoopLatchBB->getNextNode());
-  // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
-  // will mess up cleanup of dbginfo, and verifier will complain.
-  IRBuilder<>(PhonySuccessorBB).CreateUnreachable();
-
-  // Create two new empty blocks that we will use to preserve the original
-  // loop exit control-flow, and preserve the incoming values in the PHI nodes
-  // in the loop's successor exit blocks. These will live on.
-  auto *ComparedUnequalBB =
-      BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
-                         PhonySuccessorBB->getNextNode());
-  auto *ComparedEqualBB =
-      BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
-                         PhonySuccessorBB->getNextNode());
-
-  // By now we have: (1/6)
-  // PreheaderBB:                   ; preds = ???
-  //   <...>
-  //   %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
-  //   %ComparedEqual = icmp eq <...> %memcmp, 0
-  //   [no terminator instruction!]
-  // PhonyPreheaderBB: <preheader>  ; No preds, UNREACHABLE!
-  //   br label %LoopHeaderBB
-  // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
-  //   <...>
-  //   br i1 %<...>, label %LoopLatchBB, label %Successor0BB
-  // LoopLatchBB: <latch,exiting>   ; preds = %LoopHeaderBB
-  //   <...>
-  //   br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
-  // PhonySuccessorBB:              ; No preds, UNREACHABLE!
-  //   unreachable
-  // EqualBB:                       ; No preds, UNREACHABLE!
-  //   [no terminator instruction!]
-  // UnequalBB:                     ; No preds, UNREACHABLE!
-  //   [no terminator instruction!]
-  // Successor0BB: <exit>           ; preds = %LoopHeaderBB
-  //   %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
-  //   <...>
-  // Successor1BB: <exit>           ; preds = %LoopLatchBB
-  //   %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
-  //   <...>
-
-  // What is the mapping/replacement basic block for exiting out of the loop
-  // from either of the old loop's basic blocks?
-  auto GetReplacementBB = [this, ComparedEqualBB,
-                           ComparedUnequalBB](const BasicBlock *OldBB) {
-    assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
-    if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
-      return ComparedEqualBB;
-    if (OldBB == CurLoop->getHeader()) // "element compared unequal".
-      return ComparedUnequalBB;
-    llvm_unreachable("Only had two basic blocks in loop.");
-  };
-
-  // What are the exits out of this loop?
-  SmallVector<Loop::Edge, 2> LoopExitEdges;
-  CurLoop->getExitEdges(LoopExitEdges);
-  assert(LoopExitEdges.size() == 2 && "Should have only two exit edges.");
-
-  // Populate new basic blocks, update the exiting control-flow, PHI nodes.
-  for (const Loop::Edge &Edge : LoopExitEdges) {
-    auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
-    auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
-    assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
-           "Unexpected edge.");
-
-    // If we would exit the loop from this loop's basic block,
-    // what semantically would that mean? Did the comparison succeed or fail?
-    BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
-    assert(NewBB->empty() && "Should not get same new basic block here twice.");
-    IRBuilder<> Builder(NewBB);
-    Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
-    Builder.CreateBr(SuccessorBB);
-    DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
-    // Also, be *REALLY* careful with PHI nodes in the successor basic block:
-    // update them to receive the same input value, but from the new basic
-    // block instead of the current loop's basic block.
-    SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
-    // Also, change loop control-flow. This loop's basic block shall no longer
-    // exit from the loop to its original successor basic block, but to our new
-    // phony successor basic block. Note that the new successor will be the
-    // unique exit.
-    OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB,
-                                                     PhonySuccessorBB);
-    DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB});
-    DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB});
-  }
-
-  // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
-  assert(DTUpdates.size() == 8 && "Update count prediction failed.");
-  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-  DTU.applyUpdates(DTUpdates);
-  DTUpdates.clear();
-
-  // By now we have: (2/6)
-  // PreheaderBB:                   ; preds = ???
-  //   <...>
-  //   %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
-  //   %ComparedEqual = icmp eq <...> %memcmp, 0
-  //   [no terminator instruction!]
-  // PhonyPreheaderBB: <preheader>  ; No preds, UNREACHABLE!
-  //   br label %LoopHeaderBB
-  // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB
-  //   <...>
-  //   br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
-  // LoopLatchBB: <latch,exiting>   ; preds = %LoopHeaderBB
-  //   <...>
-  //   br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
-  // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB
-  //   unreachable
-  // EqualBB:                       ; No preds, UNREACHABLE!
-  //   br label %Successor1BB
-  // UnequalBB:                     ; No preds, UNREACHABLE!
-  //   br label %Successor0BB
-  // Successor0BB:                  ; preds = %UnequalBB
-  //   %S0PHI = phi <...> [ <...>, %UnequalBB ]
-  //   <...>
-  // Successor1BB:                  ; preds = %EqualBB
-  //   %S0PHI = phi <...> [ <...>, %EqualBB ]
-  //   <...>
-
-  // *Finally*, zap the original loop. Record its parent loop though.
-  Loop *ParentLoop = CurLoop->getParentLoop();
-  LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
-  LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting!
-  deleteDeadLoop(CurLoop, DT, SE, LI);    // And actually delete the loop.
-  CurLoop = nullptr;
-
-  // By now we have: (3/6)
-  // PreheaderBB:                   ; preds = ???
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // [no terminator instruction!] - // PhonyPreheaderBB: ; No preds, UNREACHABLE! - // br label %PhonySuccessorBB - // PhonySuccessorBB: ; preds = %PhonyPreheaderBB - // unreachable - // EqualBB: ; No preds, UNREACHABLE! - // br label %Successor1BB - // UnequalBB: ; No preds, UNREACHABLE! - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Now, actually restore the CFG. - - // Insert an unconditional branch from an actual preheader basic block to - // phony preheader basic block. - IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB); - DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); - // Insert proper conditional branch from phony successor basic block to the - // "dispatch" basic blocks, which were used to preserve incoming values in - // original loop's successor basic blocks. - assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) && - "Yep, that's the one we created to keep deleteDeadLoop() happy."); - PhonySuccessorBB->getTerminator()->eraseFromParent(); - { - IRBuilder<> Builder(PhonySuccessorBB); - Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc()); - Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB); - } - DTUpdates.push_back( - {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB}); - DTUpdates.push_back( - {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB}); - - BasicBlock *DispatchBB = PhonySuccessorBB; - DispatchBB->setName(LoopName + ".bcmpdispatchbb"); - - assert(DTUpdates.size() == 3 && "Update count prediction failed."); - DTU.applyUpdates(DTUpdates); - DTUpdates.clear(); - - // By now we have: (4/6) - // PreheaderBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %PhonyPreheaderBB - // PhonyPreheaderBB: ; preds = %PreheaderBB - // br label %DispatchBB - // DispatchBB: ; preds = %PhonyPreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // The basic CFG has been restored! Now let's merge redundant basic blocks. - - // Merge phony successor basic block into it's only predecessor, - // phony preheader basic block. It is fully pointlessly redundant. - MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); - - // By now we have: (5/6) - // PreheaderBB: ; preds = ??? 
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %DispatchBB - // DispatchBB: ; preds = %PreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Was this loop nested? - if (!ParentLoop) { - // If the loop was *NOT* nested, then let's also merge phony successor - // basic block into it's only predecessor, preheader basic block. - // Also, here we need to update LoopInfo. - LI->removeBlock(PreheaderBB); - MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); - - // By now we have: (6/6) - // DispatchBB: ; preds = ??? - // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: ; preds = %DispatchBB - // br label %Successor1BB - // UnequalBB: ; preds = %DispatchBB - // br label %Successor0BB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - return DispatchBB; - } - - // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop. - // To achieve that, we shall keep the preheader basic block (mainly so that - // the loop header block will be guaranteed to have a predecessor outside of - // the loop), and create a phony loop with all these new three basic blocks. - Loop *PhonyLoop = LI->AllocateLoop(); - ParentLoop->addChildLoop(PhonyLoop); - PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI); - PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI); - PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI); - - // But we only have a preheader basic block, a header basic block block and - // two exiting basic blocks. For a proper loop we also need a backedge from - // non-header basic block to header bb. - // Let's just add a never-taken branch from both of the exiting basic blocks. - for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) { - BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator()); - assert(OldTerminator->isUnconditional() && "That's the one we created."); - BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0); - - IRBuilder<> Builder(OldTerminator); - Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc()); - Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB, - DispatchBB); - OldTerminator->eraseFromParent(); - // Yes, the backedge will never be taken. The control-flow is redundant. - // If it can be simplified further, other passes will take care. - DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB}); - DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB}); - DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB}); - } - assert(DTUpdates.size() == 6 && "Update count prediction failed."); - DTU.applyUpdates(DTUpdates); - DTUpdates.clear(); - - // By now we have: (6/6) - // PreheaderBB: <preheader> ; preds = ??? 
- // <...> - // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) - // %ComparedEqual = icmp eq <...> %memcmp, 0 - // br label %BCmpDispatchBB - // BCmpDispatchBB: <header> ; preds = %PreheaderBB - // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB - // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB - // br i1 %true, label %Successor1BB, label %BCmpDispatchBB - // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB - // br i1 %true, label %Successor0BB, label %BCmpDispatchBB - // Successor0BB: ; preds = %UnequalBB - // %S0PHI = phi <...> [ <...>, %UnequalBB ] - // <...> - // Successor1BB: ; preds = %EqualBB - // %S0PHI = phi <...> [ <...>, %EqualBB ] - // <...> - - // Finally fully DONE! - return DispatchBB; -} - -void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst, - CmpInst *LatchCmpInst, - LoadInst *LoadA, LoadInst *LoadB, - const SCEV *SrcA, const SCEV *SrcB, - const SCEV *NBytes) { - // We will be inserting before the terminator instruction of preheader block. - IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator()); - - LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n"); - LLVM_DEBUG(dbgs() << "Emitting new instructions.\n"); - - // Expand the SCEV expressions for both sources to compare, and produce value - // for the byte len (beware of Iterations potentially being a pointer, and - // account for element size being BCmpTyBytes bytes, which may be not 1 byte) - Value *PtrA, *PtrB, *Len; - { - SCEVExpander SExp(*SE, *DL, "LoopToBCmp"); - SExp.setInsertPoint(&*Builder.GetInsertPoint()); - - auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) { - SExp.SetCurrentDebugLocation(DebugLoc()); - // If the pointer operand of original load had dbgloc - use it. - if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand())) - SExp.SetCurrentDebugLocation(I->getDebugLoc()); - return SExp.expandCodeFor(Src); - }; - PtrA = HandlePtr(LoadA, SrcA); - PtrB = HandlePtr(LoadB, SrcB); - - // For len calculation let's use dbgloc for the loop's latch condition. - Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); - SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); - Len = SExp.expandCodeFor(NBytes); - - Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext()); - assert(SE->getTypeSizeInBits(Len->getType()) == - DL->getTypeSizeInBits(CmpFuncSizeTy) && - "Len should already have the correct size."); - - // Make sure that iteration count is a number, insert ptrtoint cast if not. - if (Len->getType()->isPointerTy()) - Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy); - assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now."); - - Len->setName(Len->getName() + ".bytecount"); - - // There is no legality check needed. We want to compare that the memory - // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal. - // For them to be fully equal, they must match bit-by-bit. And likewise, - // for them to *NOT* be fully equal, they have to differ just by one bit. - // The step of comparison (bits compared at once) simply does not matter. - } - - // For the rest of new instructions, dbgloc should point at the value cmp. - Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc()); - - // Emit the comparison itself. - auto *CmpCall = - cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI) - : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI)); - // FIXME: add {B,Mem}CmpInst with MemoryCompareInst - // (based on MemIntrinsicBase) as base? 
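// [Editor's sketch, assumed operand names; not part of the patch] The
// sequence materialized by the emitBCmp/emitMemCmp call above plus the icmp
// created just below, for i32 elements on a 64-bit target, is roughly:
//
//   %len.bytecount = mul i64 %iterations, 4
//   %memcmp = call i32 @memcmp(i8* %a, i8* %b, i64 %len.bytecount)
//   %a.vs.b.eqcmp = icmp eq i32 %memcmp, 0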
- // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...) - - // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal. - auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ( - CmpCall, ConstantInt::get(CmpCall->getType(), 0), - PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp")); - - BasicBlock *BB = transformBCmpControlFlow(ComparedEqual); - Builder.ClearInsertionPoint(); - - // We're done. - LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n"); - ORE.emit([&]() { - return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall", - CmpCall->getDebugLoc(), BB) - << "Transformed bcmp idiom into a call to " - << ore::NV("NewFunction", CmpCall->getCalledFunction()) - << "() function"; - }); - ++NumBCmp; -} - -/// Recognizes a bcmp idiom in a non-countable loop. -/// -/// If detected, transforms the relevant code to issue the bcmp (or memcmp) -/// intrinsic function call, and returns true; otherwise, returns false. -bool LoopIdiomRecognize::recognizeBCmp() { - if (!HasMemCmp && !HasBCmp) - return false; - - ICmpInst *BCmpInst; - CmpInst *LatchCmpInst; - LoadInst *LoadA, *LoadB; - const SCEV *SrcA, *SrcB, *NBytes; - if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, - NBytes)) { - LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n"); - return false; - } - - transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes); - return true; -} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 368b9d4e8df1..901204181a7c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" @@ -226,7 +227,8 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, Optional<MemorySSAUpdater> MSSAU; if (AR.MSSA) { MSSAU = MemorySSAUpdater(AR.MSSA); - AR.MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 1af4b21b432e..6ce2d06058cf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -716,22 +717,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( return true; } -static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (PHINode &PHI : Block->phis()) { - // Reduction lcssa phi will have only 1 incoming block that from loop latch. - if (PHI.getNumIncomingValues() > 1) - return false; - Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); - if (!Ins) - return false; - // Incoming value for lcssa phi's in outer loop exit can only be inner loop - // exits lcssa phi else it would not be tightly nested. 
-    if (!isa<PHINode>(Ins) && isOuterLoopExitBlock)
-      return false;
-  }
-  return true;
-}
-
 // This function indicates the current limitations in the transform as a result
 // of which we do not proceed.
 bool LoopInterchangeLegality::currentLimitations() {
@@ -830,21 +815,6 @@ bool LoopInterchangeLegality::currentLimitations() {
     return true;
   }
 
-  // TODO: We only handle LCSSA PHI's corresponding to reduction for now.
-  BasicBlock *InnerExit = InnerLoop->getExitBlock();
-  if (!containsSafePHI(InnerExit, false)) {
-    LLVM_DEBUG(
-        dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Only inner loops with LCSSA PHIs can be interchanged "
-                "currently.";
-    });
-    return true;
-  }
-
   // TODO: Current limitation: Since we split the inner loop latch at the point
   // where the induction variable is incremented (induction.next), we cannot
   // have more than 1 user of induction.next since it would result in broken
   // code
@@ -920,6 +890,28 @@ bool LoopInterchangeLegality::currentLimitations() {
   return false;
 }
 
+// We currently only support LCSSA PHI nodes in the inner loop exit if their
+// users are either reduction PHIs or PHIs outside the outer loop (which means
+// we are only interested in the final value after the loop).
+static bool
+areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
+                              SmallPtrSetImpl<PHINode *> &Reductions) {
+  BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
+  for (PHINode &PHI : InnerExit->phis()) {
+    // A reduction LCSSA PHI will have only 1 incoming block, the loop latch.
+    if (PHI.getNumIncomingValues() > 1)
+      return false;
+    if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
+          PHINode *PN = dyn_cast<PHINode>(U);
+          return !PN || (Reductions.find(PN) == Reductions.end() &&
+                         OuterL->contains(PN->getParent()));
+        })) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // We currently support LCSSA PHI nodes in the outer loop exit, if their
 // incoming values do not come from the outer loop latch or if the
 // outer loop latch has a single predecessor. In that case, the value will
@@ -927,7 +919,7 @@ bool LoopInterchangeLegality::currentLimitations() {
 // will still be true after interchanging. If we have multiple predecessors,
 // that may not be the case, e.g. because the outer loop latch may be executed
 // if the inner loop is not executed.
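// [Editor's illustration, assumed IR; not part of the patch] An inner-exit
// LCSSA PHI that areInnerLoopExitPHIsSupported above accepts: a single
// incoming value whose only users are reduction PHIs or PHIs outside the
// outer loop, e.g.
//
//   inner.exit:                          ; preds = %inner.latch
//     %val.lcssa = phi i32 [ %val, %inner.latch ]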
-static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { // FIXME: We currently are not able to detect floating point reductions @@ -1012,7 +1004,19 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } - if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, + OuterInnerReductions)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + + if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", @@ -1315,31 +1319,39 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { FromBB->getTerminator()->getIterator()); } -/// Update BI to jump to NewBB instead of OldBB. Records updates to -/// the dominator tree in DTUpdates, if DT should be preserved. +// Update BI to jump to NewBB instead of OldBB. Records updates to the +// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that +// \p OldBB is exactly once in BI's successor list. static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, - std::vector<DominatorTree::UpdateType> &DTUpdates) { - assert(llvm::count_if(successors(BI), - [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && - "BI must jump to OldBB at most once."); - for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { - if (BI->getSuccessor(i) == OldBB) { - BI->setSuccessor(i, NewBB); - - DTUpdates.push_back( - {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); - DTUpdates.push_back( - {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); - break; + std::vector<DominatorTree::UpdateType> &DTUpdates, + bool MustUpdateOnce = true) { + assert((!MustUpdateOnce || + llvm::count_if(successors(BI), + [OldBB](BasicBlock *BB) { + return BB == OldBB; + }) == 1) && "BI must jump to OldBB exactly once."); + bool Changed = false; + for (Use &Op : BI->operands()) + if (Op == OldBB) { + Op.set(NewBB); + Changed = true; } + + if (Changed) { + DTUpdates.push_back( + {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); + DTUpdates.push_back( + {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); } + assert(Changed && "Expected a successor to be updated"); } // Move Lcssa PHIs to the right place. static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, - BasicBlock *OuterLatch, BasicBlock *OuterExit) { + BasicBlock *OuterLatch, BasicBlock *OuterExit, + Loop *InnerLoop, LoopInfo *LI) { // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are // defined either in the header or latch. Those blocks will become header and @@ -1394,19 +1406,17 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, P->moveBefore(InnerExit->getFirstNonPHI()); // Deal with LCSSA PHI nodes in the loop nest exit block. 
For PHIs that have - // incoming values from the outer latch or header, we have to add a new PHI + // incoming values defined in the outer loop, we have to add a new PHI // in the inner loop latch, which became the exit block of the outer loop, // after interchanging. if (OuterExit) { for (PHINode &P : OuterExit->phis()) { if (P.getNumIncomingValues() != 1) continue; - // Skip Phis with incoming values not defined in the outer loop's header - // and latch. Also skip incoming phis defined in the latch. Those should + // Skip Phis with incoming values defined in the inner loop. Those should // already have been updated. auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); - if (!I || ((I->getParent() != OuterLatch || isa<PHINode>(I)) && - I->getParent() != OuterHeader)) + if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) continue; PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); @@ -1481,12 +1491,21 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!InnerLoopHeaderSuccessor) return false; - // Adjust Loop Preheader and headers + // Adjust Loop Preheader and headers. + // The branches in the outer loop predecessor and the outer loop header can + // be unconditional branches or conditional branches with duplicates. Consider + // this when updating the successors. updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, - InnerLoopPreHeader, DTUpdates); - updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); + InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); + // The outer loop header might or might not branch to the outer latch. + // We are guaranteed to branch to the inner loop preheader. + if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI), + OuterLoopLatch) != succ_end(OuterLoopHeaderBI)) + updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates, + /*MustUpdateOnce=*/false); updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, - InnerLoopHeaderSuccessor, DTUpdates); + InnerLoopHeaderSuccessor, DTUpdates, + /*MustUpdateOnce=*/false); // Adjust reduction PHI's now that the incoming block has changed. InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, @@ -1520,7 +1539,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { OuterLoopPreHeader); moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, - OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock()); + OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), + InnerLoop, LI); // For PHIs in the exit block of the outer loop, outer's latch has been // replaced by Inners'. OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index e8dc879a184b..4e1b4e87ebc9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -488,7 +489,7 @@ public: // Filter the candidates further. 
SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; unsigned NumForwarding = 0; - for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { LLVM_DEBUG(dbgs() << "Candidate " << Cand); // Make sure that the stored values is available everywhere in the loop in @@ -544,7 +545,8 @@ public: auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 885c0e8f4b8b..1a42f6b23443 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -191,9 +191,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -248,7 +251,9 @@ struct LoopICmp { class LoopPredication { AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; + LoopInfo *LI; BranchProbabilityInfo *BPI; Loop *L; @@ -300,10 +305,13 @@ class LoopPredication { // within the loop. We identify such unprofitable loops through BPI. bool isLoopProfitableToPredicate(); + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); + public: - LoopPredication(AliasAnalysis *AA, ScalarEvolution *SE, + LoopPredication(AliasAnalysis *AA, DominatorTree *DT, + ScalarEvolution *SE, LoopInfo *LI, BranchProbabilityInfo *BPI) - : AA(AA), SE(SE), BPI(BPI){}; + : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {}; bool runOnLoop(Loop *L); }; @@ -323,10 +331,12 @@ public: if (skipLoop(L)) return false; auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - LoopPredication LP(AA, SE, &BPI); + LoopPredication LP(AA, DT, SE, LI, &BPI); return LP.runOnLoop(L); } }; @@ -352,7 +362,7 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F); - LoopPredication LP(&AR.AA, &AR.SE, BPI); + LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, BPI); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -823,9 +833,9 @@ bool LoopPredication::widenWidenableBranchGuardConditions( Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = BI->getCondition(); BI->setCondition(AllChecks); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); assert(isGuardAsWidenableBranch(BI) && "Stopped being a guard after transform?"); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return 
true; @@ -953,6 +963,233 @@ bool LoopPredication::isLoopProfitableToPredicate() { return true; } +/// If we can (cheaply) find a widenable branch which controls entry into the +/// loop, return it. +static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) { + // Walk back through any unconditional executed blocks and see if we can find + // a widenable condition which seems to control execution of this loop. Note + // that we predict that maythrow calls are likely untaken and thus that it's + // profitable to widen a branch before a maythrow call with a condition + // afterwards even though that may cause the slow path to run in a case where + // it wouldn't have otherwise. + BasicBlock *BB = L->getLoopPreheader(); + if (!BB) + return nullptr; + do { + if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (BB == Pred->getSingleSuccessor()) { + BB = Pred; + continue; + } + break; + } while (true); + + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + auto *Term = Pred->getTerminator(); + + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) && + IfTrueBB == BB) + return cast<BranchInst>(Term); + } + return nullptr; +} + +/// Return the minimum of all analyzeable exit counts. This is an upper bound +/// on the actual exit count. If there are not at least two analyzeable exits, +/// returns SCEVCouldNotCompute. +static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, + Loop *L) { + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + SmallVector<const SCEV *, 4> ExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + if (ExitCounts.size() < 2) + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} + +/// Return true if we can be fairly sure that executing block BB will probably +/// lead to executing an __llvm_deoptimize. This is a profitability heuristic, +/// not a legality constraint. +static bool isVeryLikelyToDeopt(BasicBlock *BB) { + while (BB->getUniqueSuccessor()) + // Will skip side effects, that's okay + BB = BB->getUniqueSuccessor(); + + return BB->getTerminatingDeoptimizeCall(); +} + +/// This implements an analogous, but entirely distinct transform from the main +/// loop predication transform. This one is phrased in terms of using a +/// widenable branch *outside* the loop to allow us to simplify loop exits in a +/// following loop. This is close in spirit to the IndVarSimplify transform +/// of the same name, but is materially different widening loosens legality +/// sharply. +bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + // The transformation performed here aims to widen a widenable condition + // above the loop such that all analyzeable exit leading to deopt are dead. + // It assumes that the latch is the dominant exit for profitability and that + // exits branching to deoptimizing blocks are rarely taken. It relies on the + // semantics of widenable expressions for legality. (i.e. 
being able to fall + // down the widenable path spuriously allows us to ignore exit order, + // unanalyzeable exits, side effects, exceptional exits, and other challenges + // which restrict the applicability of the non-WC based version of this + // transform in IndVarSimplify.) + // + // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may + // imply flags on the expression being hoisted and inserting new uses (flags + // are only correct for current uses). The result is that we may be + // inserting a branch on the value which can be either poison or undef. In + // this case, the branch can legally go either way; we just need to avoid + // introducing UB. This is achieved through the use of the freeze + // instruction. + + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + if (ExitingBlocks.empty()) + return false; // Nothing to do. + + auto *Latch = L->getLoopLatch(); + if (!Latch) + return false; + + auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI); + if (!WidenableBR) + return false; + + const SCEV *LatchEC = SE->getExitCount(L, Latch); + if (isa<SCEVCouldNotCompute>(LatchEC)) + return false; // profitability - want hot exit in analyzeable set + + // At this point, we have found an analyzeable latch, and a widenable + // condition above the loop. If we have a widenable exit within the loop + // (for which we can't compute exit counts), drop the ability to further + // widen so that we gain ability to analyze it's exit count and perform this + // transform. TODO: It'd be nice to know for sure the exit became + // analyzeable after dropping widenability. + { + bool Invalidate = false; + + for (auto *ExitingBB : ExitingBlocks) { + if (LI->getLoopFor(ExitingBB) != L) + continue; + + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + Use *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && + L->contains(IfTrueBB)) { + WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); + Invalidate = true; + } + } + if (Invalidate) + SE->forgetLoop(L); + } + + // The use of umin(all analyzeable exits) instead of latch is subtle, but + // important for profitability. We may have a loop which hasn't been fully + // canonicalized just yet. If the exit we chose to widen is provably never + // taken, we want the widened form to *also* be provably never taken. We + // can't guarantee this as a current unanalyzeable exit may later become + // analyzeable, but we can at least avoid the obvious cases. + const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() || + !SE->isLoopInvariant(MinEC, L) || + !isSafeToExpandAt(MinEC, WidenableBR, *SE)) + return false; + + // Subtlety: We need to avoid inserting additional uses of the WC. We know + // that it can only have one transitive use at the moment, and thus moving + // that use to just before the branch and inserting code before it and then + // modifying the operand is legal. + auto *IP = cast<Instruction>(WidenableBR->getCondition()); + IP->moveBefore(WidenableBR); + Rewriter.setInsertPoint(IP); + IRBuilder<> B(IP); + + bool Changed = false; + Value *MinECV = nullptr; // lazily generated if needed + for (BasicBlock *ExitingBB : ExitingBlocks) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. 
Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + continue; + + // Can't rewrite non-branch yet. + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + continue; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount) || + ExitCount->getType()->isPointerTy() || + !isSafeToExpandAt(ExitCount, WidenableBR, *SE)) + continue; + + const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1); + if (!isVeryLikelyToDeopt(ExitBB)) + // Profitability: indicator of rarely/never taken exit + continue; + + // If we found a widenable exit condition, do two things: + // 1) fold the widened exit test into the widenable condition + // 2) fold the branch to untaken - avoids infinite looping + + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!MinECV) + MinECV = Rewriter.expandCodeFor(MinEC); + Value *RHS = MinECV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + assert(!Latch || DT->dominates(ExitingBB, Latch)); + Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS); + // Freeze poison or undef to an arbitrary bit pattern to ensure we can + // branch without introducing UB. See NOTE ON POISON/UNDEF above for + // context. + NewCond = B.CreateFreeze(NewCond); + + widenWidenableBranch(WidenableBR, NewCond); + + Value *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue)); + Changed = true; + } + + if (Changed) + // We just mutated a bunch of loop exits changing there exit counts + // widely. We need to force recomputation of the exit counts given these + // changes. Note that all of the inserted exits are never taken, and + // should be removed next time the CFG is modified. 
+ SE->forgetLoop(L); + return Changed; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -1004,16 +1241,12 @@ bool LoopPredication::runOnLoop(Loop *Loop) { cast<BranchInst>(BB->getTerminator())); } - if (Guards.empty() && GuardsAsWidenableBranches.empty()) - return false; - SCEVExpander Expander(*SE, *DL, "loop-predication"); - bool Changed = false; for (auto *Guard : Guards) Changed |= widenGuardConditions(Guard, Expander); for (auto *Guard : GuardsAsWidenableBranches) Changed |= widenWidenableBranchGuardConditions(Guard, Expander); - + Changed |= predicateLoopExits(L, Expander); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 96e2c2a3ac6b..da13a342ae12 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +53,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <cstddef> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp index 94517996df39..0868e742f4ee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 299f3fc5fb19..b27e65e0adb7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -30,6 +30,8 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" @@ -660,6 +662,9 @@ static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, // Merge Succ into Pred and delete it. 
MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + Changed = true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp index 65e0dee0225a..1c03a4bf6c02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -41,14 +41,15 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 7f119175c4a8..e9f368628a08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -74,7 +74,6 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -97,6 +96,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +108,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -115,8 +116,8 @@ #include <cstdlib> #include <iterator> #include <limits> -#include <numeric> #include <map> +#include <numeric> #include <utility> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 8d88be420314..92ad8dafa5ab 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -426,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. 
+ // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. + for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist<Loop *, 4> Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DependenceAnalysisWrapperPass>(); - getLoopAnalysisUsage(AU); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } }; @@ -480,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -491,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F); + OptimizationRemarkEmitter &ORE = + AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index a6d4164c3645..4c2b079c6bb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,6 +46,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -212,7 +213,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - 
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index b410df0c5f68..915e053704b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -683,7 +684,7 @@ bool LoopUnswitch::processCurrentLoop() { for (auto &I : *BB) { auto CS = CallSite(&I); if (!CS) continue; - if (CS.hasFnAttr(Attribute::Convergent)) + if (CS.isConvergent()) return false; if (auto *II = dyn_cast<InvokeInst>(&I)) if (!II->getUnwindDest()->canSplitPredecessors()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 2ccb7cae3079..7b9af527d444 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -79,6 +79,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index e076424d9042..ab7b85e89e7b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/LowerAtomic.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp index d0fcf38b5a7b..21c6c32e8e02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index d85f20b3f80c..53671c7bc3d1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp 
b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index 9489e01774d6..45f5929e3b90 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/GuardUtils.h"
@@ -60,7 +61,7 @@ static bool lowerGuardIntrinsic(Function &F) {
   DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
 
   for (auto *CI : ToLower) {
-    makeGuardControlFlowExplicit(DeoptIntrinsic, CI);
+    makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false);
     CI->eraseFromParent();
   }
 
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
new file mode 100644
index 000000000000..0ff6ee8bcfcc
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -0,0 +1,894 @@
+//===- LowerMatrixIntrinsics.cpp -  Lower matrix intrinsics -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower matrix intrinsics to vector operations.
+//
+// TODO:
+//  * Implement multiply & add fusion
+//  * Add a remark summarizing the available matrix optimization opportunities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-matrix-intrinsics"
+
+static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",
+                                            cl::init(true));
+
+static cl::opt<bool> AllowContractEnabled(
+    "matrix-allow-contract", cl::init(false), cl::Hidden,
+    cl::desc("Allow the use of FMAs if available and profitable. This may "
+             "result in different results, due to less rounding error."));
+
+namespace {
+
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of column \p Col with type (\p EltType x \p NumRows)
+// assuming \p Stride elements between the starts of two consecutive columns.
+// \p Stride must be >= \p NumRows.
+//
+// Consider a 4x4 matrix like below
+//
+//      0       1      2      3
+// 0   v_0_0  v_0_1  v_0_2  v_0_3
+// 1   v_1_0  v_1_1  v_1_2  v_1_3
+// 2   v_2_0  v_2_1  v_2_2  v_2_3
+// 3   v_3_0  v_3_1  v_3_2  v_3_3
+
+// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
+// we need a pointer to the first element of the submatrix as base pointer.
+// Then we can use computeColumnAddr to compute the addresses for the columns
+// of the sub-matrix.
+//
+// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+//           -> just returns Base
+// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+//           -> returns Base + (1 * 4)
+// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+//           -> returns Base + (2 * 4)
+//
+// The graphic below illustrates the number of elements in a column (marked
+// with |) and the number of skipped elements (marked with {).
+//
+//         v_0_0  v_0_1 {v_0_2 {v_0_3
+//                Base   Col 1  Col 2
+//                  |     |      |
+//         v_1_0 |v_1_1 |v_1_2 |v_1_3
+//         v_2_0 |v_2_1 |v_2_2 |v_2_3
+//         v_3_0 {v_3_1 {v_3_2  v_3_3
+//
+Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride,
+                         unsigned NumRows, Type *EltType,
+                         IRBuilder<> &Builder) {
+
+  assert((!isa<ConstantInt>(Stride) ||
+          cast<ConstantInt>(Stride)->getZExtValue() >= NumRows) &&
+         "Stride must be >= the number of rows.");
+  unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+
+  // Compute the start of the column with index Col as Col * Stride.
+  Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start");
+
+  // Get pointer to the start of the selected column. Skip GEP creation
+  // if we select column 0.
+  if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero())
+    ColumnStart = BasePtr;
+  else
+    ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep");
+
+  // Cast elementwise column start pointer to a pointer to a column
+  // (EltType x NumRows)*.
+  Type *ColumnType = VectorType::get(EltType, NumRows);
+  Type *ColumnPtrType = PointerType::get(ColumnType, AS);
+  return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast");
+}
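The addressing above reduces to plain index arithmetic: the start of column Col is Base + Col * Stride, with NumRows contiguous elements per column. A minimal standalone sketch of the same arithmetic (plain C++, independent of the LLVM API; the names and the flat column-major layout are illustrative), checked against the three column addresses worked out in the comment:

#include <cassert>
#include <cstddef>

// Start index (in elements) of column Col, mirroring computeColumnAddr:
// ColumnStart = Base + Col * Stride, where Stride >= NumRows.
static std::size_t columnStart(std::size_t Base, std::size_t Col,
                               std::size_t Stride) {
  return Base + Col * Stride;
}

int main() {
  // 2x3 sub-matrix of a 4x4 column-major matrix, starting at row 1, column 1:
  // the base pointer addresses v_1_1 (flat index 1 * 4 + 1 = 5); the stride
  // of the enclosing matrix (4) is unchanged.
  const std::size_t Base = 5, Stride = 4;
  assert(columnStart(Base, 0, Stride) == Base);     // just returns Base
  assert(columnStart(Base, 1, Stride) == Base + 4); // Base + (1 * 4)
  assert(columnStart(Base, 2, Stride) == Base + 8); // Base + (2 * 4)
  return 0;
}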
+
+/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
+///
+/// Currently, the lowering for each matrix intrinsic is done as follows:
+/// 1. Propagate the shape information from intrinsics to connected
+///    instructions.
+/// 2. Lower instructions with shape information.
+///  2.1. Get column vectors for each argument. If we already lowered the
+///       definition of an argument, use the produced column vectors directly.
+///       If not, split the operand vector containing an embedded matrix into
+///       a set of column vectors.
+///  2.2. Lower the instruction in terms of columnwise operations, which yields
+///       a set of column vectors containing the result matrix. Note that we
+///       lower all instructions that have shape information. Besides the
+///       intrinsics, this includes stores for example.
+///  2.3. Update uses of the lowered instruction. If we have shape information
+///       for a user, there is nothing to do, as we will look up the result
+///       column matrix when lowering the user. For other uses, we embed the
+///       result matrix in a flat vector and update the use.
+///  2.4. Cache the result column matrix for the instruction we lowered.
+/// 3. After we lowered all instructions in a function, remove the now
+///    obsolete instructions.
+///
+class LowerMatrixIntrinsics {
+  Function &Func;
+  const DataLayout &DL;
+  const TargetTransformInfo &TTI;
+
+  /// Wrapper class representing a matrix as a set of column vectors.
+  /// All column vectors must have the same vector type.
+  class ColumnMatrixTy {
+    SmallVector<Value *, 16> Columns;
+
+  public:
+    ColumnMatrixTy() : Columns() {}
+    ColumnMatrixTy(ArrayRef<Value *> Cols)
+        : Columns(Cols.begin(), Cols.end()) {}
+
+    Value *getColumn(unsigned i) const { return Columns[i]; }
+
+    void setColumn(unsigned i, Value *V) { Columns[i] = V; }
+
+    size_t getNumColumns() const { return Columns.size(); }
+    size_t getNumRows() const {
+      assert(Columns.size() > 0 && "Cannot call getNumRows without columns");
+      return cast<VectorType>(Columns[0]->getType())->getNumElements();
+    }
+
+    const SmallVectorImpl<Value *> &getColumnVectors() const { return Columns; }
+
+    SmallVectorImpl<Value *> &getColumnVectors() { return Columns; }
+
+    void addColumn(Value *V) { Columns.push_back(V); }
+
+    iterator_range<SmallVector<Value *, 8>::iterator> columns() {
+      return make_range(Columns.begin(), Columns.end());
+    }
+
+    /// Embed the columns of the matrix into a flat vector by concatenating
+    /// them.
+    Value *embedInVector(IRBuilder<> &Builder) const {
+      return Columns.size() == 1 ? Columns[0]
+                                 : concatenateVectors(Builder, Columns);
+    }
+  };
+
+  struct ShapeInfo {
+    unsigned NumRows;
+    unsigned NumColumns;
+
+    ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+        : NumRows(NumRows), NumColumns(NumColumns) {}
+
+    ShapeInfo(Value *NumRows, Value *NumColumns)
+        : NumRows(cast<ConstantInt>(NumRows)->getZExtValue()),
+          NumColumns(cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+    bool operator==(const ShapeInfo &other) {
+      return NumRows == other.NumRows && NumColumns == other.NumColumns;
+    }
+    bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+    /// Returns true if shape-information is defined, meaning both dimensions
+    /// are != 0.
+    operator bool() const {
+      assert(NumRows == 0 || NumColumns != 0);
+      return NumRows != 0;
+    }
+  };
+
+  /// Maps instructions to their shape information. The shape information
+  /// describes the shape to be used while lowering. This matches the shape of
+  /// the result value of the instruction, with the only exceptions being store
+  /// instructions and the matrix_columnwise_store intrinsics. For those, the
+  /// shape information indicates that those instructions should be lowered
+  /// using shape information as well.
+  DenseMap<Value *, ShapeInfo> ShapeMap;
+
+  /// List of instructions to remove. While lowering, we do not replace all
+  /// users of a lowered instruction if shape information is available; those
+  /// users need to be removed after we have finished lowering.
+  SmallVector<Instruction *, 16> ToRemove;
+
+  /// Map from instructions to their produced column matrix.
+  DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
+
+public:
+  LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
+      : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
+
+  /// Return the set of column vectors that a matrix value is lowered to.
+  ///
+  /// If we lowered \p MatrixVal, just return the cached result column matrix.
+  /// Otherwise split the flat vector \p MatrixVal containing a matrix with
+  /// shape \p SI into column vectors.
+  ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+                           IRBuilder<> Builder) {
+    VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
+    assert(VType && "MatrixVal must be a vector type");
+    assert(VType->getNumElements() == SI.NumRows * SI.NumColumns &&
+           "The vector size must match the number of matrix elements");
+
+    // Check if we lowered MatrixVal using shape information. In that case,
+    // return the existing column matrix, if it matches the requested shape
+    // information. If there is a mismatch, embed the result in a flat
+    // vector and split it later.
+    auto Found = Inst2ColumnMatrix.find(MatrixVal);
+    if (Found != Inst2ColumnMatrix.end()) {
+      ColumnMatrixTy &M = Found->second;
+      // Return the found matrix, if its shape matches the requested shape
+      // information.
+      if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
+        return M;
+
+      MatrixVal = M.embedInVector(Builder);
+    }
+
+    // Otherwise split MatrixVal.
+    SmallVector<Value *, 16> SplitVecs;
+    Value *Undef = UndefValue::get(VType);
+    for (unsigned MaskStart = 0; MaskStart < VType->getNumElements();
+         MaskStart += SI.NumRows) {
+      Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0);
+      Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split");
+      SplitVecs.push_back(V);
+    }
+
+    return {SplitVecs};
+  }
+
+  /// If \p V already has a known shape, return false. Otherwise set the shape
+  /// for instructions that support it.
+  bool setShapeInfo(Value *V, ShapeInfo Shape) {
+    assert(Shape && "Shape not set");
+    if (isa<UndefValue>(V) || !supportsShapeInfo(V))
+      return false;
+
+    auto SIter = ShapeMap.find(V);
+    if (SIter != ShapeMap.end()) {
+      LLVM_DEBUG(dbgs() << "  not overriding existing shape: "
+                        << SIter->second.NumRows << " "
+                        << SIter->second.NumColumns << " for " << *V << "\n");
+      return false;
+    }
+
+    ShapeMap.insert({V, Shape});
+    LLVM_DEBUG(dbgs() << "  " << Shape.NumRows << " x " << Shape.NumColumns
+                      << " for " << *V << "\n");
+    return true;
+  }
+
+  bool isUniformShape(Value *V) {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (!I)
+      return true;
+
+    switch (I->getOpcode()) {
+    case Instruction::FAdd:
+    case Instruction::FSub:
+    case Instruction::FMul: // Scalar multiply.
+    case Instruction::Add:
+    case Instruction::Mul:
+    case Instruction::Sub:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  /// Returns true if shape information can be used for \p V. The supported
+  /// instructions must match the instructions that can be lowered by this pass.
+  bool supportsShapeInfo(Value *V) {
+    Instruction *Inst = dyn_cast<Instruction>(V);
+    if (!Inst)
+      return false;
+
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+    if (II)
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::matrix_multiply:
+      case Intrinsic::matrix_transpose:
+      case Intrinsic::matrix_columnwise_load:
+      case Intrinsic::matrix_columnwise_store:
+        return true;
+      default:
+        return false;
+      }
+    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+  }
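The fallback path of getMatrix above splits the flat, column-major vector with one shufflevector per column, using sequential masks [MaskStart, MaskStart + NumRows). A scalar model of that split (plain C++ with std::vector standing in for IR vectors; a sketch under that simplification, not the pass itself):

#include <cassert>
#include <vector>

// Model of getMatrix's fallback: split a flat column-major vector of
// NumRows * NumColumns elements into NumColumns column vectors. Each column
// corresponds to one shuffle with mask [MaskStart, MaskStart + NumRows).
static std::vector<std::vector<double>>
splitIntoColumns(const std::vector<double> &Flat, unsigned NumRows) {
  std::vector<std::vector<double>> Columns;
  for (unsigned MaskStart = 0; MaskStart < Flat.size(); MaskStart += NumRows)
    Columns.emplace_back(Flat.begin() + MaskStart,
                         Flat.begin() + MaskStart + NumRows);
  return Columns;
}

int main() {
  // A 2x3 matrix embedded in a flat vector: columns {1,2}, {3,4}, {5,6}.
  std::vector<double> Flat = {1, 2, 3, 4, 5, 6};
  auto Cols = splitIntoColumns(Flat, /*NumRows=*/2);
  assert(Cols.size() == 3 && Cols[1][0] == 3 && Cols[1][1] == 4);
  return 0;
}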
+
+  /// Propagate the shape information of instructions to their users.
+  /// The work list contains instructions for which we can compute the shape,
+  /// either based on the information provided by matrix intrinsics or known
+  /// shapes of operands.
+  SmallVector<Instruction *, 32>
+  propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
+    SmallVector<Instruction *, 32> NewWorkList;
+    // Pop an element for which we are guaranteed to have at least one of the
+    // operand shapes. Add the shape for this instruction and then add its
+    // users to the work list.
+    LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
+    while (!WorkList.empty()) {
+      Instruction *Inst = WorkList.back();
+      WorkList.pop_back();
+
+      // New entry: set the value and insert operands.
+      bool Propagate = false;
+
+      Value *MatrixA;
+      Value *MatrixB;
+      Value *M;
+      Value *N;
+      Value *K;
+      if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
+                          m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+                          m_Value(N), m_Value(K)))) {
+        Propagate = setShapeInfo(Inst, {M, K});
+      } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
+                           m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+        // Flip dimensions.
+        Propagate = setShapeInfo(Inst, {N, M});
+      } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
+                           m_Value(MatrixA), m_Value(), m_Value(),
+                           m_Value(M), m_Value(N)))) {
+        Propagate = setShapeInfo(Inst, {N, M});
+      } else if (match(Inst,
+                       m_Intrinsic<Intrinsic::matrix_columnwise_load>(
+                           m_Value(), m_Value(), m_Value(M), m_Value(N)))) {
+        Propagate = setShapeInfo(Inst, {M, N});
+      } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
+        auto OpShape = ShapeMap.find(MatrixA);
+        if (OpShape != ShapeMap.end())
+          setShapeInfo(Inst, OpShape->second);
+        continue;
+      } else if (isUniformShape(Inst)) {
+        // Find the first operand that has a known shape and use that.
+        for (auto &Op : Inst->operands()) {
+          auto OpShape = ShapeMap.find(Op.get());
+          if (OpShape != ShapeMap.end()) {
+            Propagate |= setShapeInfo(Inst, OpShape->second);
+            break;
+          }
+        }
+      }
+
+      if (Propagate) {
+        NewWorkList.push_back(Inst);
+        for (auto *User : Inst->users())
+          if (ShapeMap.count(User) == 0)
+            WorkList.push_back(cast<Instruction>(User));
+      }
+    }
+
+    return NewWorkList;
+  }
+
+  /// Propagate the shape to operands of instructions with shape information.
+  /// \p Worklist contains the instructions for which we already know the
+  /// shape.
+  SmallVector<Instruction *, 32>
+  propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
+    SmallVector<Instruction *, 32> NewWorkList;
+
+    auto pushInstruction = [](Value *V,
+                              SmallVectorImpl<Instruction *> &WorkList) {
+      Instruction *I = dyn_cast<Instruction>(V);
+      if (I)
+        WorkList.push_back(I);
+    };
+    // Pop an element with a known shape. Traverse its operands; if an
+    // operand's shape derives from the result shape and is still unknown,
+    // set it and add the operand to the worklist.
+    LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
+    while (!WorkList.empty()) {
+      Value *V = WorkList.back();
+      WorkList.pop_back();
+
+      size_t BeforeProcessingV = WorkList.size();
+      if (!isa<Instruction>(V))
+        continue;
+
+      Value *MatrixA;
+      Value *MatrixB;
+      Value *M;
+      Value *N;
+      Value *K;
+      if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
+                       m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+                       m_Value(N), m_Value(K)))) {
+        if (setShapeInfo(MatrixA, {M, N}))
+          pushInstruction(MatrixA, WorkList);
+
+        if (setShapeInfo(MatrixB, {N, K}))
+          pushInstruction(MatrixB, WorkList);
+
+      } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
+                       m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+        // Flip dimensions.
+        if (setShapeInfo(MatrixA, {M, N}))
+          pushInstruction(MatrixA, WorkList);
+      } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
+                       m_Value(MatrixA), m_Value(), m_Value(),
+                       m_Value(M), m_Value(N)))) {
+        if (setShapeInfo(MatrixA, {M, N})) {
+          pushInstruction(MatrixA, WorkList);
+        }
+      } else if (isa<LoadInst>(V) ||
+                 match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) {
+        // Nothing to do, no matrix input.
+ } else if (isa<StoreInst>(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. + ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast<Instruction>(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + pushInstruction(U.get(), WorkList); + } + } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa<Instruction>(U) && V != U) + NewWorkList.push_back(cast<Instruction>(U)); + } + return NewWorkList; + } + + bool Visit() { + if (EnableShapePropagation) { + SmallVector<Instruction *, 32> WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. + for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } + } + + ReversePostOrderTraversal<Function *> RPOT(&Func); + bool Changed = false; + for (auto *BB : RPOT) { + for (Instruction &Inst : make_early_inc_range(*BB)) { + IRBuilder<> Builder(&Inst); + + if (CallInst *CInst = dyn_cast<CallInst>(&Inst)) + Changed |= VisitCallInst(CInst); + + Value *Op1; + Value *Op2; + if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst)) + Changed |= VisitBinaryOperator(BinOp); + if (match(&Inst, m_Load(m_Value(Op1)))) + Changed |= VisitLoad(&Inst, Op1, Builder); + else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2)))) + Changed |= VisitStore(&Inst, Op1, Op2, Builder); + } + } + + for (Instruction *Inst : reverse(ToRemove)) + Inst->eraseFromParent(); + + return Changed; + } + + LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, + IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load"); + } + + StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, + Type *EltType, IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align); + } + + + /// Turns \p BasePtr into an elementwise pointer to \p EltType. 
+  Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
+    unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+    Type *EltPtrType = PointerType::get(EltType, AS);
+    return Builder.CreatePointerCast(BasePtr, EltPtrType);
+  }
+
+  /// Replace intrinsic calls.
+  bool VisitCallInst(CallInst *Inst) {
+    if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
+      return false;
+
+    switch (Inst->getCalledFunction()->getIntrinsicID()) {
+    case Intrinsic::matrix_multiply:
+      LowerMultiply(Inst);
+      break;
+    case Intrinsic::matrix_transpose:
+      LowerTranspose(Inst);
+      break;
+    case Intrinsic::matrix_columnwise_load:
+      LowerColumnwiseLoad(Inst);
+      break;
+    case Intrinsic::matrix_columnwise_store:
+      LowerColumnwiseStore(Inst);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride,
+                 ShapeInfo Shape) {
+    IRBuilder<> Builder(Inst);
+    auto VType = cast<VectorType>(Inst->getType());
+    Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+    ColumnMatrixTy Result;
+    // Stride is the distance between the start of one column and the start
+    // of the next.
+    for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) {
+      Value *GEP =
+          computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows,
+                            VType->getElementType(), Builder);
+      Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder);
+      Result.addColumn(Column);
+    }
+
+    finalizeLowering(Inst, Result, Builder);
+  }
+
+  /// Lowers llvm.matrix.columnwise.load.
+  ///
+  /// The intrinsic loads a matrix from memory using a stride between columns.
+  void LowerColumnwiseLoad(CallInst *Inst) {
+    Value *Ptr = Inst->getArgOperand(0);
+    Value *Stride = Inst->getArgOperand(1);
+    LowerLoad(Inst, Ptr, Stride,
+              {Inst->getArgOperand(2), Inst->getArgOperand(3)});
+  }
+
+  void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
+                  ShapeInfo Shape) {
+    IRBuilder<> Builder(Inst);
+    auto VType = cast<VectorType>(Matrix->getType());
+    Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+    auto LM = getMatrix(Matrix, Shape, Builder);
+    for (auto C : enumerate(LM.columns())) {
+      Value *GEP =
+          computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride,
+                            Shape.NumRows, VType->getElementType(), Builder);
+      createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
+    }
+
+    ToRemove.push_back(Inst);
+  }
+
+  /// Lowers llvm.matrix.columnwise.store.
+  ///
+  /// The intrinsic stores a matrix back to memory using a stride between
+  /// columns.
+  void LowerColumnwiseStore(CallInst *Inst) {
+    Value *Matrix = Inst->getArgOperand(0);
+    Value *Ptr = Inst->getArgOperand(1);
+    Value *Stride = Inst->getArgOperand(2);
+    LowerStore(Inst, Matrix, Ptr, Stride,
+               {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+  }
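LowerLoad and LowerStore above transfer one column per loop iteration, with column C starting at EltPtr + C * Stride. A scalar model of the memory layout a columnwise store produces (plain C++; the helper and its names are illustrative, not part of the pass):

#include <cassert>
#include <vector>

// Scalar model of the columnwise store: column C of a NumRows x NumColumns
// matrix is written to Mem starting at offset C * Stride, Stride >= NumRows.
static void storeColumnwise(const std::vector<std::vector<double>> &Columns,
                            std::vector<double> &Mem, std::size_t Stride) {
  for (std::size_t C = 0; C < Columns.size(); ++C)
    for (std::size_t R = 0; R < Columns[C].size(); ++R)
      Mem[C * Stride + R] = Columns[C][R];
}

int main() {
  // Store a 2x2 matrix with stride 3: one padding element between columns.
  std::vector<std::vector<double>> Cols = {{1, 2}, {3, 4}};
  std::vector<double> Mem(6, 0);
  storeColumnwise(Cols, Mem, /*Stride=*/3);
  assert(Mem[0] == 1 && Mem[1] == 2 && Mem[3] == 3 && Mem[4] == 4);
  return 0;
}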
+
+  /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from
+  /// the matrix \p LM represented as a vector of column vectors.
+  Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J,
+                       unsigned NumElts, IRBuilder<> Builder) {
+    Value *Col = LM.getColumn(J);
+    Value *Undef = UndefValue::get(Col->getType());
+    Constant *Mask = createSequentialMask(Builder, I, NumElts, 0);
+    return Builder.CreateShuffleVector(Col, Undef, Mask, "block");
+  }
+
+  // Set elements I..I+NumElts-1 to Block.
+  Value *insertVector(Value *Col, unsigned I, Value *Block,
+                      IRBuilder<> Builder) {
+
+    // First, bring Block to the same size as Col.
+    unsigned BlockNumElts =
+        cast<VectorType>(Block->getType())->getNumElements();
+    unsigned NumElts = cast<VectorType>(Col->getType())->getNumElements();
+    assert(NumElts >= BlockNumElts && "Too few elements for current block");
+
+    Value *ExtendMask =
+        createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts);
+    Value *Undef = UndefValue::get(Block->getType());
+    Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask);
+
+    // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
+    // 8, 4, 5, 6
+    SmallVector<Constant *, 16> Mask;
+    unsigned i;
+    for (i = 0; i < I; i++)
+      Mask.push_back(Builder.getInt32(i));
+
+    unsigned VecNumElts = cast<VectorType>(Col->getType())->getNumElements();
+    for (; i < I + BlockNumElts; i++)
+      Mask.push_back(Builder.getInt32(i - I + VecNumElts));
+
+    for (; i < VecNumElts; i++)
+      Mask.push_back(Builder.getInt32(i));
+
+    Value *MaskVal = ConstantVector::get(Mask);
+
+    return Builder.CreateShuffleVector(Col, Block, MaskVal);
+  }
+
+  Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
+                      IRBuilder<> &Builder, bool AllowContraction) {
+
+    if (!Sum)
+      return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+
+    if (UseFPOp) {
+      if (AllowContraction) {
+        // Use fmuladd for floating point operations and let the backend decide
+        // if that's profitable.
+        Value *FMulAdd = Intrinsic::getDeclaration(
+            Func.getParent(), Intrinsic::fmuladd, A->getType());
+        return Builder.CreateCall(FMulAdd, {A, B, Sum});
+      }
+      Value *Mul = Builder.CreateFMul(A, B);
+      return Builder.CreateFAdd(Sum, Mul);
+    }
+
+    Value *Mul = Builder.CreateMul(A, B);
+    return Builder.CreateAdd(Sum, Mul);
+  }
+
+  /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
+  /// users with shape information, there's nothing to do: they will use the
+  /// cached value when they are lowered. For other users, \p Matrix is
+  /// flattened and the uses are updated to use it. Also marks \p Inst for
+  /// deletion.
+  void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,
+                        IRBuilder<> &Builder) {
+    Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+
+    ToRemove.push_back(Inst);
+    Value *Flattened = nullptr;
+    for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+      Use &U = *I++;
+      if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
+        if (!Flattened)
+          Flattened = Matrix.embedInVector(Builder);
+        U.set(Flattened);
+      }
+    }
+  }
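The interleaving mask in insertVector above is easy to check in isolation. A standalone sketch (plain C++) that rebuilds the mask exactly as the three loops above do, verified against the documented example (a 7-lane Col, I = 2, BlockNumElts = 2):

#include <cassert>
#include <vector>

// Mirror insertVector's mask: indices < I and >= I + BlockNumElts select from
// the original vector (operand 0); the middle indices select the extended
// block, whose lanes live at VecNumElts + 0 .. VecNumElts + BlockNumElts - 1.
static std::vector<unsigned> buildInsertMask(unsigned VecNumElts, unsigned I,
                                             unsigned BlockNumElts) {
  std::vector<unsigned> Mask;
  unsigned i = 0;
  for (; i < I; i++)
    Mask.push_back(i);
  for (; i < I + BlockNumElts; i++)
    Mask.push_back(i - I + VecNumElts);
  for (; i < VecNumElts; i++)
    Mask.push_back(i);
  return Mask;
}

int main() {
  // Matches the comment: 7 lanes, insert a 2-element block at index 2.
  std::vector<unsigned> Expected = {0, 1, 7, 8, 4, 5, 6};
  assert(buildInsertMask(7, 2, 2) == Expected);
  return 0;
}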
+
+  /// Lowers llvm.matrix.multiply.
+  void LowerMultiply(CallInst *MatMul) {
+    IRBuilder<> Builder(MatMul);
+    auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+    ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+    ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+    const ColumnMatrixTy &Lhs =
+        getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+    const ColumnMatrixTy &Rhs =
+        getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+
+    const unsigned R = LShape.NumRows;
+    const unsigned M = LShape.NumColumns;
+    const unsigned C = RShape.NumColumns;
+    assert(M == RShape.NumRows);
+
+    // Initialize the output.
+    ColumnMatrixTy Result;
+    for (unsigned J = 0; J < C; ++J)
+      Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));
+
+    const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /
+                                     EltType->getPrimitiveSizeInBits(),
+                                 uint64_t(1));
+
+    bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+                                                  MatMul->hasAllowContract());
+    // Multiply columns from the first operand with scalars from the second
+    // operand. Then move along the K axis and accumulate the columns. With
+    // this the adds can be vectorized without reassociation.
+    for (unsigned J = 0; J < C; ++J) {
+      unsigned BlockSize = VF;
+      for (unsigned I = 0; I < R; I += BlockSize) {
+        // Gradually lower the vectorization factor to cover the remainder.
+        while (I + BlockSize > R)
+          BlockSize /= 2;
+
+        Value *Sum = nullptr;
+        for (unsigned K = 0; K < M; ++K) {
+          Value *L = extractVector(Lhs, I, K, BlockSize, Builder);
+          Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
+          Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+          Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
+                             Builder, AllowContract);
+        }
+        Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
+      }
+    }
+    finalizeLowering(MatMul, Result, Builder);
+  }
+
+  /// Lowers llvm.matrix.transpose.
+  void LowerTranspose(CallInst *Inst) {
+    ColumnMatrixTy Result;
+    IRBuilder<> Builder(Inst);
+    Value *InputVal = Inst->getArgOperand(0);
+    VectorType *VectorTy = cast<VectorType>(InputVal->getType());
+    ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
+    ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+    for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) {
+      // Build a single column vector for this row. First initialize it.
+      Value *ResultColumn = UndefValue::get(
+          VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns));
+
+      // Go through the elements of this row and insert them into the
+      // resulting column vector.
+      for (auto C : enumerate(InputMatrix.columns())) {
+        Value *Elt = Builder.CreateExtractElement(C.value(), Row);
+        // We insert at index Column since that is the row index after the
+        // transpose.
+        ResultColumn =
+            Builder.CreateInsertElement(ResultColumn, Elt, C.index());
+      }
+      Result.addColumn(ResultColumn);
+    }
+
+    finalizeLowering(Inst, Result, Builder);
+  }
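A scalar model of the traversal in LowerMultiply above: for each result column J, process rows in blocks (halving the block size to cover the remainder), and for each block accumulate along K, multiplying a block of LHS column K by the scalar Rhs[J][K]. Plain C++ under a column-major std::vector representation; a sketch, not the pass:

#include <cassert>
#include <vector>

using Matrix = std::vector<std::vector<double>>; // column-major: M[col][row]

// Result(R x C) = Lhs(R x M) * Rhs(M x C), accumulating along K per block so
// the adds can later be vectorized without reassociation.
static Matrix multiply(const Matrix &Lhs, const Matrix &Rhs, unsigned R,
                       unsigned M, unsigned C, unsigned VF) {
  Matrix Result(C, std::vector<double>(R, 0.0));
  for (unsigned J = 0; J < C; ++J) {
    unsigned BlockSize = VF;
    for (unsigned I = 0; I < R; I += BlockSize) {
      while (I + BlockSize > R) // shrink the block to cover the remainder
        BlockSize /= 2;
      for (unsigned K = 0; K < M; ++K)     // accumulate along K
        for (unsigned B = 0; B < BlockSize; ++B)
          Result[J][I + B] += Lhs[K][I + B] * Rhs[J][K];
    }
  }
  return Result;
}

int main() {
  // 2x2 example (columns listed first): [[1,3],[2,4]] * [[5,7],[6,8]].
  Matrix Lhs = {{1, 2}, {3, 4}}, Rhs = {{5, 6}, {7, 8}};
  Matrix Res = multiply(Lhs, Rhs, 2, 2, 2, /*VF=*/4);
  assert(Res[0][0] == 23 && Res[0][1] == 34 && Res[1][0] == 31 &&
         Res[1][1] == 46);
  return 0;
}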
+
+  /// Lower load instructions, if shape information is available.
+  bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) {
+    auto I = ShapeMap.find(Inst);
+    if (I == ShapeMap.end())
+      return false;
+
+    LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+    return true;
+  }
+
+  bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr,
+                  IRBuilder<> &Builder) {
+    auto I = ShapeMap.find(StoredVal);
+    if (I == ShapeMap.end())
+      return false;
+
+    LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.NumRows),
+               I->second);
+    return true;
+  }
+
+  /// Lower binary operators, if shape information is available.
+  bool VisitBinaryOperator(BinaryOperator *Inst) {
+    auto I = ShapeMap.find(Inst);
+    if (I == ShapeMap.end())
+      return false;
+
+    Value *Lhs = Inst->getOperand(0);
+    Value *Rhs = Inst->getOperand(1);
+
+    IRBuilder<> Builder(Inst);
+    ShapeInfo &Shape = I->second;
+
+    ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder);
+    ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder);
+
+    // Apply the operation to each pair of columns and collect the result
+    // columns.
+    ColumnMatrixTy Result;
+    auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+      switch (Inst->getOpcode()) {
+      case Instruction::Add:
+        return Builder.CreateAdd(LHS, RHS);
+      case Instruction::Mul:
+        return Builder.CreateMul(LHS, RHS);
+      case Instruction::Sub:
+        return Builder.CreateSub(LHS, RHS);
+      case Instruction::FAdd:
+        return Builder.CreateFAdd(LHS, RHS);
+      case Instruction::FMul:
+        return Builder.CreateFMul(LHS, RHS);
+      case Instruction::FSub:
+        return Builder.CreateFSub(LHS, RHS);
+      default:
+        llvm_unreachable("Unsupported binary operator for matrix");
+      }
+    };
+    for (unsigned C = 0; C < Shape.NumColumns; ++C)
+      Result.addColumn(
+          BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C)));
+
+    finalizeLowering(Inst, Result, Builder);
+    return true;
+  }
+};
+} // namespace
+
+PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  LowerMatrixIntrinsics LMT(F, TTI);
+  if (LMT.Visit()) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
+    initializeLowerMatrixIntrinsicsLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    LowerMatrixIntrinsics LMT(F, *TTI);
+    bool C = LMT.Visit();
+    return C;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+} // namespace
+
+static const char pass_name[] = "Lower the matrix intrinsics";
+char LowerMatrixIntrinsicsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+                      false, false)
+INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+                    false, false)
+
+Pass *llvm::createLowerMatrixIntrinsicsPass() {
+  return new LowerMatrixIntrinsicsLegacyPass();
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
index 5342f2ddcb6b..73b2cd06fa23 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp
+++
b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 789232e0f5ce..5ffae128f5f0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,10 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -56,23 +57,11 @@ struct MakeGuardsExplicitLegacyPass : public FunctionPass { static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { // Replace the guard with an explicit branch (just like in GuardWidening). - BasicBlock *BB = Guard->getParent(); - makeGuardControlFlowExplicit(DeoptIntrinsic, Guard); - BranchInst *ExplicitGuard = cast<BranchInst>(BB->getTerminator()); - assert(ExplicitGuard->isConditional() && "Must be!"); + BasicBlock *OriginalBB = Guard->getParent(); + (void)OriginalBB; + makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true); + assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold"); - // We want the guard to be expressed as explicit control flow, but still be - // widenable. For that, we add Widenable Condition intrinsic call to the - // guard's condition. 
- IRBuilder<> B(ExplicitGuard); - auto *WidenableCondition = - B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, - {}, {}, nullptr, "widenable_cond"); - WidenableCondition->setCallingConv(Guard->getCallingConv()); - auto *NewCond = - B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition); - NewCond->setName("exiplicit_guard_cond"); - ExplicitGuard->setCondition(NewCond); Guard->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 2364748efb05..c24fa40860eb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -387,16 +388,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; // Determine alignment - unsigned Alignment = Range.Alignment; - if (Alignment == 0) { - Type *EltType = - cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL.getABITypeAlignment(EltType); - } - - AMemSet = - Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + const Align Alignment = DL.getValueOrABITypeAlignment( + MaybeAlign(Range.Alignment), + cast<PointerType>(StartPtr->getType())->getElementType()); + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, + Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -416,25 +413,21 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } -static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) - StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - return StoreAlign; +static Align findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), + SI->getOperand(0)->getType()); } -static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - return LoadAlign; +static Align findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(LI->getAlignment()), + LI->getType()); } -static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, - const LoadInst *LI) { - unsigned StoreAlign = findStoreAlignment(DL, SI); - unsigned LoadAlign = findLoadAlignment(DL, LI); - return MinAlign(StoreAlign, LoadAlign); +static Align findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + Align StoreAlign = 
findStoreAlignment(DL, SI); + Align LoadAlign = findLoadAlignment(DL, LI); + return commonAlignment(StoreAlign, LoadAlign); } // This method try to lift a store instruction before position P. @@ -649,7 +642,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - findCommonAlignment(DL, SI, LI), C); + findCommonAlignment(DL, SI, LI).value(), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -682,12 +675,11 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *T = V->getType(); if (T->isAggregateType()) { uint64_t Size = DL.getTypeStoreSize(T); - unsigned Align = SI->getAlignment(); - if (!Align) - Align = DL.getABITypeAlignment(T); + const Align MA = + DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), T); IRBuilder<> Builder(SI); auto *M = - Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, Align); + Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, MA); LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); @@ -982,12 +974,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // example we could be moving from movaps -> movq on x86. IRBuilder<> Builder(M); if (UseMemMove) - Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); else - Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); // Remove the instruction we're replacing. 
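The recurring pattern in these MemCpyOptimizer hunks replaces "unsigned alignment, where 0 means ABI alignment" with the Align/MaybeAlign types. A standalone sketch of the two helper semantics as used here (my reading of the calls above, not the LLVM implementation): the value-or-ABI helper falls back to the type's ABI alignment when none was specified, and the common alignment of two pointers is the smaller of their power-of-two alignments.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Sketch: alignments are powers of two; 0 stands in for "unspecified"
// (a MaybeAlign with no value).
static uint64_t valueOrABITypeAlignment(uint64_t Specified, uint64_t ABIAlign) {
  return Specified ? Specified : ABIAlign;
}

// The alignment two accesses are jointly known to have is the smaller one.
static uint64_t commonAlign(uint64_t A, uint64_t B) { return std::min(A, B); }

int main() {
  // A store with no explicit alignment on a type with ABI alignment 4...
  uint64_t StoreAlign = valueOrABITypeAlignment(0, 4);
  // ...combined with a load explicitly aligned to 16 gives a common
  // alignment of 4, as findCommonAlignment computes above.
  assert(commonAlign(StoreAlign, valueOrABITypeAlignment(16, 8)) == 4);
  return 0;
}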
@@ -1057,7 +1049,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, Builder.CreateMemSet( Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest, SrcSize), - MemSet->getOperand(1), MemsetLen, Align); + MemSet->getOperand(1), MemsetLen, MaybeAlign(Align)); MD->removeInstruction(MemSet); MemSet->eraseFromParent(); @@ -1125,8 +1117,8 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, } IRBuilder<> Builder(MemCpy); - Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), - CopySize, MemCpy->getDestAlignment()); + Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize, + MaybeAlign(MemCpy->getDestAlignment())); return true; } @@ -1153,7 +1145,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { M->getModule()->getDataLayout())) { IRBuilder<> Builder(M); Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(), - M->getDestAlignment(), false); + MaybeAlign(M->getDestAlignment()), false); MD->removeInstruction(M); M->eraseFromParent(); ++NumCpyToSet; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 98a45b391319..ce1e142101b8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -50,6 +50,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 9799ea7960ec..6b0d0202d9bb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -83,6 +83,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 1260bd39cdee..bba9082e31b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -82,7 +82,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -101,10 +100,12 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp index b213264de557..6a643480f312 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ 
-76,7 +76,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -94,6 +93,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -106,6 +106,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -489,11 +490,11 @@ namespace { class NewGVN { Function &F; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - AliasAnalysis *AA; - MemorySSA *MSSA; - MemorySSAWalker *MSSAWalker; + DominatorTree *DT = nullptr; + const TargetLibraryInfo *TLI = nullptr; + AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + MemorySSAWalker *MSSAWalker = nullptr; const DataLayout &DL; std::unique_ptr<PredicateInfo> PredInfo; @@ -505,7 +506,7 @@ class NewGVN { const SimplifyQuery SQ; // Number of function arguments, used by ranking - unsigned int NumFuncArgs; + unsigned int NumFuncArgs = 0; // RPOOrdering of basic blocks DenseMap<const DomTreeNode *, unsigned> RPOOrdering; @@ -516,9 +517,9 @@ class NewGVN { // startsout in, and represents any value. Being an optimistic analysis, // anything in the TOP class has the value TOP, which is indeterminate and // equivalent to everything. - CongruenceClass *TOPClass; + CongruenceClass *TOPClass = nullptr; std::vector<CongruenceClass *> CongruenceClasses; - unsigned NextCongruenceNum; + unsigned NextCongruenceNum = 0; // Value Mappings. DenseMap<Value *, CongruenceClass *> ValueToClass; @@ -862,7 +863,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. 
- int64_t StartingVNCounter; + int64_t StartingVNCounter = 0; }; } // end anonymous namespace diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 68a0f5151ad5..58763ec72ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index beb299272ed8..5c4a89977c38 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -47,6 +47,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index 124f625ef7b6..41940e980faa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -30,7 +30,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -50,12 +49,14 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <utility> @@ -173,7 +174,7 @@ void ReassociatePass::BuildRankMap(Function &F, << "\n"); } - // Traverse basic blocks in ReversePostOrder + // Traverse basic blocks in ReversePostOrder. for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++Rank << 16; @@ -1898,6 +1899,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, ValueRankMap.erase(I); Insts.remove(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); for (auto Op : Ops) if (Instruction *OpInst = dyn_cast<Instruction>(Op)) @@ -1914,6 +1916,7 @@ void ReassociatePass::EraseInst(Instruction *I) { // Erase the dead instruction. ValueRankMap.erase(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); // Optimize its operands. SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. 
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 3296322e00d5..0716c1320982 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,16 +16,17 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 48bbdd8d1b33..b242f100faff 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp index 10fbdc8aacd2..e696ea83a300 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -29,7 +29,6 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include <cassert> #include <utility> @@ -2196,7 +2197,7 @@ bool llvm::runIPSCCP( findReturnsToZap(*F, ReturnsToZap, Solver); } - for (const auto &F : Solver.getMRVFunctionsTracked()) { + for (auto F : Solver.getMRVFunctionsTracked()) { assert(F->getReturnType()->isStructTy() && "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index 74b8ff913050..89916e43fce2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -41,7 +41,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -71,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" 
+#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -80,6 +80,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -361,7 +362,7 @@ private: /// The beginning and ending offsets of the alloca for this /// partition. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0, EndOffset = 0; /// The start and end iterators of this partition. iterator SI, SJ; @@ -1680,24 +1681,20 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, } /// Compute the adjusted alignment for a load or store from an offset. -static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, - const DataLayout &DL) { - unsigned Alignment; +static Align getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + MaybeAlign Alignment; Type *Ty; if (auto *LI = dyn_cast<LoadInst>(I)) { - Alignment = LI->getAlignment(); + Alignment = MaybeAlign(LI->getAlignment()); Ty = LI->getType(); } else if (auto *SI = dyn_cast<StoreInst>(I)) { - Alignment = SI->getAlignment(); + Alignment = MaybeAlign(SI->getAlignment()); Ty = SI->getValueOperand()->getType(); } else { llvm_unreachable("Only loads and stores are allowed!"); } - - if (!Alignment) - Alignment = DL.getABITypeAlignment(Ty); - - return MinAlign(Alignment, Offset); + return commonAlignment(DL.getValueOrABITypeAlignment(Alignment, Ty), Offset); } /// Test whether we can convert a value from the old to the new type. @@ -2300,9 +2297,9 @@ class llvm::sroa::AllocaSliceRewriter // The new offsets of the slice currently being rewritten relative to the // original alloca. - uint64_t NewBeginOffset, NewEndOffset; + uint64_t NewBeginOffset = 0, NewEndOffset = 0; - uint64_t SliceSize; + uint64_t SliceSize = 0; bool IsSplittable = false; bool IsSplit = false; Use *OldUse = nullptr; @@ -2432,13 +2429,14 @@ private: /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. - unsigned getSliceAlign(Type *Ty = nullptr) { - unsigned NewAIAlign = NewAI.getAlignment(); - if (!NewAIAlign) - NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = - MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); - return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; + MaybeAlign getSliceAlign(Type *Ty = nullptr) { + const MaybeAlign NewAIAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(NewAI.getAlignment()), NewAI.getAllocatedType()); + const MaybeAlign Align = + commonAlignment(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align && Align->value() == DL.getABITypeAlignment(Ty)) + ? 
None + : Align; } unsigned getIndex(uint64_t Offset) { @@ -2800,7 +2798,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, - getSliceAlign(), II.isVolatile()); + MaybeAlign(getSliceAlign()), II.isVolatile()); if (AATags) New->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -2886,7 +2884,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - unsigned SliceAlign = getSliceAlign(); + MaybeAlign SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2956,10 +2954,10 @@ private: // Compute the relative offset for the other pointer within the transfer. unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); - unsigned OtherAlign = - IsDest ? II.getSourceAlignment() : II.getDestAlignment(); - OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1, - OtherOffset.zextOrTrunc(64).getZExtValue()); + Align OtherAlign = + assumeAligned(IsDest ? II.getSourceAlignment() : II.getDestAlignment()); + OtherAlign = + commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce @@ -2972,7 +2970,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); Value *DestPtr, *SrcPtr; - unsigned DestAlign, SrcAlign; + MaybeAlign DestAlign, SrcAlign; // Note: IsDest is true iff we're copying into the new alloca slice if (IsDest) { DestPtr = OurPtr; @@ -3019,9 +3017,9 @@ private: Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); - unsigned SrcAlign = OtherAlign; + MaybeAlign SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - unsigned DstAlign = SliceAlign; + MaybeAlign DstAlign = SliceAlign; if (!IsDest) { std::swap(SrcPtr, DstPtr); std::swap(SrcAlign, DstAlign); @@ -3117,20 +3115,17 @@ private: Instruction *I = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - LI->setAlignment(MaybeAlign(std::min(LoadAlign, getSliceAlign()))); + MaybeAlign LoadAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(LI->getAlignment()), LI->getType()); + LI->setAlignment(std::min(LoadAlign, getSliceAlign())); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) { Value *Op = SI->getOperand(0); - StoreAlign = DL.getABITypeAlignment(Op->getType()); - } - SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); - continue; + MaybeAlign StoreAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(SI->getAlignment()), Op->getType()); + SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + continue; } assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || @@ -3222,7 +3217,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// The current pointer use being rewritten. This is used to dig up the used /// value (as opposed to the user). - Use *U; + Use *U = nullptr; /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; @@ -3277,7 +3272,7 @@ private: Type *BaseTy; /// Known alignment of the base pointer. 
- unsigned BaseAlign; + Align BaseAlign; /// To calculate offset of each component so we can correctly deduce /// alignments. @@ -3286,7 +3281,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - unsigned BaseAlign, const DataLayout &DL) + Align BaseAlign, const DataLayout &DL) : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} @@ -3308,7 +3303,7 @@ private: if (Ty->isSingleValueType()) { unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); return static_cast<Derived *>(this)->emitFunc( - Ty, Agg, MinAlign(BaseAlign, Offset), Name); + Ty, Agg, commonAlignment(BaseAlign, Offset), Name); } if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { @@ -3349,18 +3344,20 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), AATags(AATags) {} + DL), + AATags(AATags) {} /// Emit a leaf load of a single value. This is called at the leaves of the /// recursive emission to actually load values. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); - LoadInst *Load = IRB.CreateAlignedLoad(Ty, GEP, Align, Name + ".load"); + LoadInst *Load = + IRB.CreateAlignedLoad(Ty, GEP, Alignment.value(), Name + ".load"); if (AATags) Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); @@ -3388,14 +3385,14 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. 
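// Editorial aside (not part of this patch): the recurring rewrite in this
// file replaces raw `unsigned` alignments, where 0 meant "unspecified", with
// the explicit llvm::Align / llvm::MaybeAlign types. The idiom
//
//   DL.getValueOrABITypeAlignment(MaybeAlign(V->getAlignment()), Ty)
//
// resolves "unspecified" to the ABI alignment of Ty, and
//
//   commonAlignment(A, Offset)
//
// plays the role of the old MinAlign(A, Offset): the largest alignment
// guaranteed to hold at a byte offset Offset from a pointer aligned to A.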
// @@ -3406,7 +3403,7 @@ private: Value *InBoundsGEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); StoreInst *Store = - IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align); + IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment.value()); if (AATags) Store->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -3863,8 +3860,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, BasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); @@ -3921,7 +3918,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); @@ -4005,8 +4003,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), LoadPartPtrTy, LoadBasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); } // And store this partition. @@ -4017,7 +4015,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), StorePartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); // Now build a new slice for the alloca. NewSlices.push_back( @@ -4152,20 +4151,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // FIXME: We might want to defer PHI speculation until after here. // FIXME: return nullptr; } else { - unsigned Alignment = AI.getAlignment(); - if (!Alignment) { - // The minimum alignment which users can rely on when the explicit - // alignment is omitted or zero is that required by the ABI for this - // type. - Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); - } - Alignment = MinAlign(Alignment, P.beginOffset()); + // If alignment is unspecified we fall back on the one required by the ABI + // for this type. We also make sure the alignment is compatible with + // P.beginOffset(). + const Align Alignment = commonAlignment( + DL.getValueOrABITypeAlignment(MaybeAlign(AI.getAlignment()), + AI.getAllocatedType()), + P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL.getABITypeAlignment(SliceTy)) - Alignment = 0; + const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy); NewAI = new AllocaInst( - SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, + SliceTy, AI.getType()->getAddressSpace(), nullptr, + IsUnconstrained ? MaybeAlign() : Alignment, AI.getName() + ".sroa."
+ Twine(P.begin() - AS.begin()), &AI); // Copy the old AI debug location over to the new one. NewAI->setDebugLoc(AI.getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp index 1d2e40bf62be..9d088547b436 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -82,6 +82,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLowerConstantIntrinsicsPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); + initializeLowerMatrixIntrinsicsLegacyPassPass(Registry); initializeLowerWidenableConditionLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); @@ -89,6 +90,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); + initializeRedundantDbgInstEliminationPass(Registry); initializeRegToMemPass(Registry); initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 2ee1a3a95f2a..c25c6c632b8f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -21,6 +22,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -33,12 +35,12 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/Scalarizer.h" #include <cassert> #include <cstdint> #include <iterator> @@ -173,8 +175,8 @@ struct VectorLayout { class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: - ScalarizerVisitor(unsigned ParallelLoopAccessMDKind) - : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) { + ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) { } bool visit(Function &F); @@ -214,6 +216,8 @@ private: GatherList Gathered; unsigned ParallelLoopAccessMDKind; + + DominatorTree *DT; }; class ScalarizerLegacyPass : public FunctionPass { @@ -225,6 +229,11 @@ public: } bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } }; } // end anonymous namespace @@ -232,6 +241,7 @@ public: char ScalarizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) @@ -303,7 +313,8 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) { Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); return Impl.visit(F); } @@ -340,6 +351,15 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { return Scatterer(BB, BB->begin(), V, &Scattered[V]); } if (Instruction *VOp = dyn_cast<Instruction>(V)) { + // When scalarizing PHI nodes we might try to examine/rewrite InsertElement + // nodes in predecessors. If those predecessors are unreachable from entry, + // then the IR in those blocks could have unexpected properties resulting in + // infinite loops in Scatterer::operator[]. By simply treating values + // originating from instructions in unreachable blocks as undef we do not + // need to analyse them further. + if (!DT->isReachableFromEntry(VOp->getParent())) + return Scatterer(Point->getParent(), Point->getIterator(), + UndefValue::get(V->getType())); // Put the scattered form of an instruction directly after the // instruction. BasicBlock *BB = VOp->getParent(); @@ -856,7 +876,10 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); bool Changed = Impl.visit(F); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return Changed ? 
PA : PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 41554fccdf08..2a1a040bf83e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -164,7 +164,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -182,6 +181,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -189,6 +189,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <string> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index ac832b9b4567..d7a34acb4318 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -38,8 +38,10 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" @@ -263,7 +265,7 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, /// to an entirely separate nest. static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, DominatorTree &DT, LoopInfo &LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { // If the loop is already at the top level, we can't hoist it anywhere. Loop *OldParentL = L.getParentLoop(); if (!OldParentL) return; @@ -317,7 +319,7 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, // Because we just hoisted a loop out of this one, we have essentially // created new exit paths from it. That means we need to form LCSSA PHI // nodes for values used in the no-longer-nested loop. - formLCSSA(*OldContainingL, DT, &LI, nullptr); + formLCSSA(*OldContainingL, DT, &LI, SE); // We shouldn't need to form dedicated exits because the exit introduced // here is the (just split by unswitching) preheader. However, after trivial @@ -329,6 +331,20 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, } } +// Return the topmost loop that has ExitBB as an exiting block, or, if there +// is no enclosing loop exiting through ExitBB, the innermost loop that +// contains ExitBB. +static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) { + Loop *TopMost = LI.getLoopFor(ExitBB); + Loop *Current = TopMost; + while (Current) { + if (Current->isLoopExiting(ExitBB)) + TopMost = Current; + Current = Current->getParentLoop(); + } + return TopMost; +} + /// Unswitch a trivial branch if the condition is loop invariant.
/// /// This routine should only be called when loop code leading to the branch has @@ -413,9 +429,10 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, }); // If we have scalar evolutions, we need to invalidate them including this - // loop and the loop containing the exit block. + // loop, the loop containing the exit block and the topmost parent loop + // exiting via LoopExitBB. if (SE) { - if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) SE->forgetLoop(ExitL); else // Forget the entire nest as this exits the entire nest. @@ -532,7 +549,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // If this was full unswitching, we may have changed the nesting relationship // for this loop so hoist it to its correct parent if needed. if (FullUnswitch) - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -825,7 +842,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // We may have changed the nesting relationship for this loop so hoist it to // its correct parent if needed. - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -2260,7 +2277,7 @@ static void unswitchNontrivialInvariants( // First build LCSSA for this loop so that we can preserve it when // forming dedicated exits. We don't want to perturb some other loop's // LCSSA while doing that CFG edit. - formLCSSA(UpdateL, DT, &LI, nullptr); + formLCSSA(UpdateL, DT, &LI, SE); // For loops reached by this loop's original exit blocks we may have // introduced new, non-dedicated exits.
At least try to re-form dedicated @@ -2426,7 +2443,7 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, if (MSSAU) { MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI)); - MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End); + MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 4544975a4887..623a8b711ed8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -35,10 +34,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <utility> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp index 90f3a2aa46e1..677d86f8c7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -78,7 +79,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, if (auto *Call = dyn_cast<CallBase>(Inst)) { // Convergent operations cannot be made control-dependent on additional // values. 
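// Editorial example (not from this patch): GPU barriers such as
// @llvm.nvvm.barrier0 carry the convergent attribute; sinking one into a
// conditionally executed block would change the set of threads that reach
// it together. CallBase::isConvergent() used below is the idiomatic
// accessor for that attribute.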
- if (Call->hasFnAttr(Attribute::Convergent)) + if (Call->isConvergent()) return false; for (Instruction *S : Stores) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index e6db11f47ead..cd7bfb2f20dc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -283,12 +283,12 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; if (IID) - FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(), - IncomingC->getType()); + FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(), + IncomingC->getType()); else FoldedCost += - TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(), - IncomingC->getType()); + TTI.getIntImmCostInst(UserI->getOpcode(), Idx, + IncomingC->getValue(), IncomingC->getType()); // If we accumulate more folded cost for this incoming constant than // materialized cost, then we'll regress any edge with this constant so diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f9d027eb4a3b..c8d899bb4871 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index a58c32cc5894..9f82b1263ebd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -60,7 +60,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,12 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <limits> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 9791cf41f621..4ce4ce46f67a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -34,8 +34,10 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" diff --git 
a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index b27a36b67d62..9f0ab9103d42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -76,6 +76,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index 707adf46d1f4..c8461fdc1608 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -12,6 +12,7 @@ #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index ee0973002c47..0908b361a4d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -232,7 +233,7 @@ static bool addDiscriminators(Function &F) { LocationSet CallLocations; for (auto &I : B.getInstList()) { // We bypass intrinsic calls for the following two reasons: - // 1) We want to avoid a non-deterministic assigment of + // 1) We want to avoid a non-deterministic assignment of // discriminators. // 2) We want to minimize the number of base discriminators used. if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I))) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index d85cc40c372a..c9eb4abfa21a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -247,7 +247,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, Instruction *STI = BB->getTerminator(); Instruction *Start = &*BB->begin(); // If there's nothing to move, mark the starting instruction as the last - // instruction in the block. + // instruction in the block. The terminator instruction is handled separately. if (Start == STI) Start = PTI; @@ -274,24 +274,20 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // Move terminator instruction. PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); + + // The terminator may be a memory-accessing instruction too. + if (MSSAU) + if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>( + MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator()))) + MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End); } // Add unreachable to now empty BB.
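// Editorial note (not part of this patch): BB has just been spliced empty,
// and every BasicBlock must still end in a terminator for the IR to remain
// well formed, so an `unreachable` is parked here until the now-dead block
// is actually erased.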
new UnreachableInst(BB->getContext(), BB); - // Eliminate duplicate dbg.values describing the entry PHI node post-splice. - for (auto Incoming : IncomingValues) { - if (isa<Instruction>(*Incoming)) { - SmallVector<DbgValueInst *, 2> DbgValues; - SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2> - DbgValueSet; - llvm::findDbgValues(DbgValues, Incoming); - for (auto &DVI : DbgValues) { - auto R = DbgValueSet.insert({DVI->getVariable(), DVI->getExpression()}); - if (!R.second) - DVI->eraseFromParent(); - } - } - } + // Eliminate duplicate/redundant dbg.values. This seems to be a good place to + // do that since we might end up with redundant dbg.values describing the + // entry PHI node post-splice. + RemoveRedundantDbgInstrs(PredBB); // Inherit predecessors name if it exists. if (!PredBB->hasName()) @@ -318,6 +314,124 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, return true; } +/// Remove redundant instructions within sequences of consecutive dbg.value +/// instructions. This is done using a backward scan to keep the last dbg.value +/// describing a specific variable/fragment. +/// +/// BackwardScan strategy: +/// ---------------------- +/// Given a sequence of consecutive DbgValueInst like this +/// +/// dbg.value ..., "x", FragmentX1 (*) +/// dbg.value ..., "y", FragmentY1 +/// dbg.value ..., "x", FragmentX2 +/// dbg.value ..., "x", FragmentX1 (**) +/// +/// then the instruction marked with (*) can be removed (it is guaranteed to be +/// obsoleted by the instruction marked with (**) as the latter instruction is +/// describing the same variable using the same fragment info). +/// +/// Possible improvements: +/// - Check fully overlapping fragments and not only identical fragments. +/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta +/// instructions being part of the sequence of consecutive instructions. +static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + SmallDenseSet<DebugVariable> VariableSet; + for (auto &I : reverse(*BB)) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + DVI->getExpression(), + DVI->getDebugLoc()->getInlinedAt()); + auto R = VariableSet.insert(Key); + // If the same variable fragment is described more than once it is enough + // to keep the last one (i.e. the first one found, since we iterate in + // reverse). + if (!R.second) + ToBeRemoved.push_back(DVI); + continue; + } + // Sequence with consecutive dbg.value instrs ended. Clear the map to + // restart identifying redundant instructions in case we find another + // dbg.value sequence. + VariableSet.clear(); + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +/// Remove redundant dbg.value instructions using a forward scan. This can +/// remove a dbg.value instruction that is redundant due to indicating that a +/// variable has the same value as already being indicated by an earlier +/// dbg.value. +/// +/// ForwardScan strategy: +/// --------------------- +/// Given two identical dbg.value instructions, separated by a block of +/// instructions that isn't describing the same variable, like this +/// +/// dbg.value X1, "x", FragmentX1 (**) +/// <block of instructions, none being "dbg.value ..., "x", ..."> +/// dbg.value X1, "x", FragmentX1 (*) +/// +/// then the instruction marked with (*) can be removed. Variable "x" is already +/// described as being mapped to the SSA value X1.
+/// +/// Possible improvements: +/// - Keep track of non-overlapping fragments. +static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap; + for (auto &I : *BB) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + NoneType(), + DVI->getDebugLoc()->getInlinedAt()); + auto VMI = VariableMap.find(Key); + // Update the map if we found a new value/expression describing the + // variable, or if the variable wasn't mapped already. + if (VMI == VariableMap.end() || + VMI->second.first != DVI->getValue() || + VMI->second.second != DVI->getExpression()) { + VariableMap[Key] = { DVI->getValue(), DVI->getExpression() }; + continue; + } + // Found an identical mapping. Remember the instruction for later removal. + ToBeRemoved.push_back(DVI); + } + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) { + bool MadeChanges = false; + // By using the "backward scan" strategy before the "forward scan" strategy we + // can remove both dbg.value (2) and (3) in a situation like this: + // + // (1) dbg.value V1, "x", DIExpression() + // ... + // (2) dbg.value V2, "x", DIExpression() + // (3) dbg.value V1, "x", DIExpression() + // + // The backward scan will remove (2), as it is made obsolete by (3). After + // getting (2) out of the way, the forward scan will remove (3) since "x" + // already is described as having the value V1 at (1). + MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB); + MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB); + + if (MadeChanges) + LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: " + << BB->getName() << "\n"); + return MadeChanges; +} + void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Value *V) { Instruction &I = *BI; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index f5e4b53f6d97..008cea333e6b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp index 3c7c8d872595..6b01c0c71d00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp @@ -30,9 +30,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CanonicalizeAliases.h" - #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 0298ff9a395f..682af4a88d3e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++
b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -805,7 +805,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, dbgs() << ")\n"; }); - StructType *StructTy; + StructType *StructTy = nullptr; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { StructTy = StructType::get(M->getContext(), paramTy); paramTy.clear(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp new file mode 100644 index 000000000000..93395ac761ab --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -0,0 +1,189 @@ +//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions performs movements on basic blocks and instructions +// contained within a function. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" + +using namespace llvm; + +#define DEBUG_TYPE "codemover-utils" + +STATISTIC(HasDependences, + "Cannot move across instructions that have memory dependences"); +STATISTIC(MayThrowException, "Cannot move across instructions that may throw"); +STATISTIC(NotControlFlowEquivalent, + "Instructions are not control flow equivalent"); +STATISTIC(NotMovedPHINode, "Movement of PHINodes is not supported"); +STATISTIC(NotMovedTerminator, "Movement of Terminators is not supported"); + +bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT); +} + +bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + if (&BB0 == &BB1) + return true; + + return ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) || + (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0))); +} + +static bool reportInvalidCandidate(const Instruction &I, + llvm::Statistic &Stat) { + ++Stat; + LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". " + << Stat.getDesc()); + return false; +} + +/// Collect all instructions in between \p StartInst and \p EndInst, and store +/// them in \p InBetweenInsts. +static void +collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst, +                             SmallPtrSetImpl<Instruction *> &InBetweenInsts) { + assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty"); + + /// Get the next instructions of \p I, and push them to \p WorkList.
+ auto getNextInsts = [](Instruction &I, + SmallPtrSetImpl<Instruction *> &WorkList) { + if (Instruction *NextInst = I.getNextNode()) + WorkList.insert(NextInst); + else { + assert(I.isTerminator() && "Expecting a terminator instruction"); + for (BasicBlock *Succ : successors(&I)) + WorkList.insert(&Succ->front()); + } + }; + + SmallPtrSet<Instruction *, 10> WorkList; + getNextInsts(StartInst, WorkList); + while (!WorkList.empty()) { + Instruction *CurInst = *WorkList.begin(); + WorkList.erase(CurInst); + + if (CurInst == &EndInst) + continue; + + if (!InBetweenInsts.insert(CurInst).second) + continue; + + getNextInsts(*CurInst, WorkList); + } +} + +bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, + const DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { + // Cannot move itself before itself. + if (&I == &InsertPoint) + return false; + + // Not moved. + if (I.getNextNode() == &InsertPoint) + return true; + + if (isa<PHINode>(I) || isa<PHINode>(InsertPoint)) + return reportInvalidCandidate(I, NotMovedPHINode); + + if (I.isTerminator()) + return reportInvalidCandidate(I, NotMovedTerminator); + + // TODO remove this limitation. + if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT)) + return reportInvalidCandidate(I, NotControlFlowEquivalent); + + // As I and InsertPoint are control flow equivalent, if I dominates + // InsertPoint, then I comes before InsertPoint. + const bool MoveForward = DT.dominates(&I, &InsertPoint); + if (MoveForward) { + // When I is being moved forward, we need to make sure the InsertPoint + // dominates every user. Or else, a user may be using an undefined I. + for (const Use &U : I.uses()) + if (auto *UserInst = dyn_cast<Instruction>(U.getUser())) + if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U)) + return false; + } else { + // When I is being moved backward, we need to make sure all its operands + // dominate the InsertPoint. Or else, an operand may be undefined for I. + for (const Value *Op : I.operands()) + if (auto *OpInst = dyn_cast<Instruction>(Op)) + if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint)) + return false; + } + + Instruction &StartInst = (MoveForward ? I : InsertPoint); + Instruction &EndInst = (MoveForward ? InsertPoint : I); + SmallPtrSet<Instruction *, 10> InstsToCheck; + collectInstructionsInBetween(StartInst, EndInst, InstsToCheck); + if (!MoveForward) + InstsToCheck.insert(&InsertPoint); + + // Check if there exist instructions which may throw, may synchronize, or may + // never return, from I to InsertPoint. + if (!isSafeToSpeculativelyExecute(&I)) + if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [](Instruction *I) { + if (I->mayThrow()) + return true; + + const CallBase *CB = dyn_cast<CallBase>(I); + if (!CB) + return false; + if (!CB->hasFnAttr(Attribute::WillReturn)) + return true; + if (!CB->hasFnAttr(Attribute::NoSync)) + return true; + + return false; + })) { + return reportInvalidCandidate(I, MayThrowException); + } + + // Check if I has any output/flow/anti dependences with instructions from \p + // StartInst to \p EndInst.
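// Editorial illustration (not from this patch): the kind of pair the
// dependence query below rejects. A load of %p cannot be moved across a
// store to %p:
//
//   store i32 %v, i32* %p   ; lies between StartInst and EndInst
//   %x = load i32, i32* %p  ; I, the candidate being moved
//
// DI.depends() classifies such a pair as a flow or anti dependence
// (depending on the direction of the move), so the move is refused.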
+ if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [&DI, &I](Instruction *CurInst) { + auto DepResult = DI.depends(&I, CurInst, true); + if (DepResult && + (DepResult->isOutput() || DepResult->isFlow() || + DepResult->isAnti())) + return true; + return false; + })) + return reportInvalidCandidate(I, HasDependences); + + return true; +} + +void llvm::moveInstsBottomUp(BasicBlock &FromBB, BasicBlock &ToBB, + const DominatorTree &DT, + const PostDominatorTree &PDT, DependenceInfo &DI) { + for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) { + Instruction *MovePos = ToBB.getFirstNonPHIOrDbg(); + Instruction &I = *It; + // Increment the iterator before modifying FromBB. + ++It; + + if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI)) + I.moveBefore(MovePos); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp new file mode 100644 index 000000000000..b7b4bfa3734d --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp @@ -0,0 +1,435 @@ +//===- Debugify.cpp - Attach synthetic debug info to everything -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This pass attaches synthetic debug info to everything. It can be used +/// to create targeted tests for debug info preservation. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Debugify.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +namespace { + +cl::opt<bool> Quiet("debugify-quiet", + cl::desc("Suppress verbose debugify output")); + +raw_ostream &dbg() { return Quiet ? nulls() : errs(); } + +uint64_t getAllocSizeInBits(Module &M, Type *Ty) { + return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0; +} + +bool isFunctionSkipped(Function &F) { + return F.isDeclaration() || !F.hasExactDefinition(); +} + +/// Find the basic block's terminating instruction. +/// +/// Special care is needed to handle musttail and deopt calls, as these behave +/// like (but are in fact not) terminators. +Instruction *findTerminatingInstruction(BasicBlock &BB) { + if (auto *I = BB.getTerminatingMustTailCall()) + return I; + if (auto *I = BB.getTerminatingDeoptimizeCall()) + return I; + return BB.getTerminator(); +} + +bool applyDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef Banner) { + // Skip modules with debug info. + if (M.getNamedMetadata("llvm.dbg.cu")) { + dbg() << Banner << "Skipping module with debug info\n"; + return false; + } + + DIBuilder DIB(M); + LLVMContext &Ctx = M.getContext(); + + // Get a DIType which corresponds to Ty. 
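// For instance, an i64 value receives a synthetic basic type named "ty64"
// (64 bits, DW_ATE_unsigned); all types with the same allocation size share
// one cached DIType node.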
+ DenseMap<uint64_t, DIType *> TypeCache; + auto getCachedDIType = [&](Type *Ty) -> DIType * { + uint64_t Size = getAllocSizeInBits(M, Ty); + DIType *&DTy = TypeCache[Size]; + if (!DTy) { + std::string Name = "ty" + utostr(Size); + DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned); + } + return DTy; + }; + + unsigned NextLine = 1; + unsigned NextVar = 1; + auto File = DIB.createFile(M.getName(), "/"); + auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify", + /*isOptimized=*/true, "", 0); + + // Visit each instruction. + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None)); + DISubprogram::DISPFlags SPFlags = + DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized; + if (F.hasPrivateLinkage() || F.hasInternalLinkage()) + SPFlags |= DISubprogram::SPFlagLocalToUnit; + auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, + SPType, NextLine, DINode::FlagZero, SPFlags); + F.setSubprogram(SP); + for (BasicBlock &BB : F) { + // Attach debug locations. + for (Instruction &I : BB) + I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP)); + + // Inserting debug values into EH pads can break IR invariants. + if (BB.isEHPad()) + continue; + + // Find the terminating instruction, after which no debug values are + // attached. + Instruction *LastInst = findTerminatingInstruction(BB); + assert(LastInst && "Expected basic block with a terminator"); + + // Maintain an insertion point which can't be invalidated when updates + // are made. + BasicBlock::iterator InsertPt = BB.getFirstInsertionPt(); + assert(InsertPt != BB.end() && "Expected to find an insertion point"); + Instruction *InsertBefore = &*InsertPt; + + // Attach debug values. + for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) { + // Skip void-valued instructions. + if (I->getType()->isVoidTy()) + continue; + + // Phis and EH pads must be grouped at the beginning of the block. + // Only advance the insertion point when we finish visiting these. + if (!isa<PHINode>(I) && !I->isEHPad()) + InsertBefore = I->getNextNode(); + + std::string Name = utostr(NextVar++); + const DILocation *Loc = I->getDebugLoc().get(); + auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(), + getCachedDIType(I->getType()), + /*AlwaysPreserve=*/true); + DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc, + InsertBefore); + } + } + DIB.finalizeSubprogram(SP); + } + DIB.finalize(); + + // Track the number of distinct lines and variables. + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify"); + auto *IntTy = Type::getInt32Ty(Ctx); + auto addDebugifyOperand = [&](unsigned N) { + NMD->addOperand(MDNode::get( + Ctx, ValueAsMetadata::getConstant(ConstantInt::get(IntTy, N)))); + }; + addDebugifyOperand(NextLine - 1); // Original number of lines. + addDebugifyOperand(NextVar - 1); // Original number of variables. + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + + // Claim that this synthetic debug info is valid. + StringRef DIVersionKey = "Debug Info Version"; + if (!M.getModuleFlag(DIVersionKey)) + M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION); + + return true; +} + +/// Return true if a mis-sized diagnostic is issued for \p DVI. +bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { + // The size of a dbg.value's value operand should match the size of the + // variable it corresponds to. 
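// In short (editorial summary of the check below): an integer variable marked
// signed is flagged when the value operand is narrower than the variable
// (e.g. an i8 operand for a 32-bit signed variable); unsigned or signless
// integer variables are not flagged; all other types must match the variable
// size exactly.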
+ // + // TODO: This, along with a check for non-null value operands, should be + // promoted to verifier failures. + Value *V = DVI->getValue(); + if (!V) + return false; + + // For now, don't try to interpret anything more complicated than an empty + // DIExpression. Eventually we should try to handle OP_deref and fragments. + if (DVI->getExpression()->getNumElements()) + return false; + + Type *Ty = V->getType(); + uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty); + Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits(); + if (!ValueOperandSize || !DbgVarSize) + return false; + + bool HasBadSize = false; + if (Ty->isIntegerTy()) { + auto Signedness = DVI->getVariable()->getSignedness(); + if (Signedness && *Signedness == DIBasicType::Signedness::Signed) + HasBadSize = ValueOperandSize < *DbgVarSize; + } else { + HasBadSize = ValueOperandSize != *DbgVarSize; + } + + if (HasBadSize) { + dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize + << ", but its variable has size " << *DbgVarSize << ": "; + DVI->print(dbg()); + dbg() << "\n"; + } + return HasBadSize; +} + +bool checkDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef NameOfWrappedPass, StringRef Banner, + bool Strip, DebugifyStatsMap *StatsMap) { + // Skip modules without debugify metadata. + NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify"); + if (!NMD) { + dbg() << Banner << "Skipping module without debugify metadata\n"; + return false; + } + + auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { + return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0)) + ->getZExtValue(); + }; + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + unsigned OriginalNumLines = getDebugifyOperand(0); + unsigned OriginalNumVars = getDebugifyOperand(1); + bool HasErrors = false; + + // Track debug info loss statistics if able. + DebugifyStatistics *Stats = nullptr; + if (StatsMap && !NameOfWrappedPass.empty()) + Stats = &StatsMap->operator[](NameOfWrappedPass); + + BitVector MissingLines{OriginalNumLines, true}; + BitVector MissingVars{OriginalNumVars, true}; + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + // Find missing lines. + for (Instruction &I : instructions(F)) { + if (isa<DbgValueInst>(&I)) + continue; + + auto DL = I.getDebugLoc(); + if (DL && DL.getLine() != 0) { + MissingLines.reset(DL.getLine() - 1); + continue; + } + + if (!DL) { + dbg() << "ERROR: Instruction with empty DebugLoc in function "; + dbg() << F.getName() << " --"; + I.print(dbg()); + dbg() << "\n"; + HasErrors = true; + } + } + + // Find missing variables and mis-sized debug values. + for (Instruction &I : instructions(F)) { + auto *DVI = dyn_cast<DbgValueInst>(&I); + if (!DVI) + continue; + + unsigned Var = ~0U; + (void)to_integer(DVI->getVariable()->getName(), Var, 10); + assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable"); + bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI); + if (!HasBadSize) + MissingVars.reset(Var - 1); + HasErrors |= HasBadSize; + } + } + + // Print the results. + for (unsigned Idx : MissingLines.set_bits()) + dbg() << "WARNING: Missing line " << Idx + 1 << "\n"; + + for (unsigned Idx : MissingVars.set_bits()) + dbg() << "WARNING: Missing variable " << Idx + 1 << "\n"; + + // Update DI loss statistics. 
+ if (Stats) { + Stats->NumDbgLocsExpected += OriginalNumLines; + Stats->NumDbgLocsMissing += MissingLines.count(); + Stats->NumDbgValuesExpected += OriginalNumVars; + Stats->NumDbgValuesMissing += MissingVars.count(); + } + + dbg() << Banner; + if (!NameOfWrappedPass.empty()) + dbg() << " [" << NameOfWrappedPass << "]"; + dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n'; + + // Strip the Debugify Metadata if required. + if (Strip) { + StripDebugInfo(M); + M.eraseNamedMetadata(NMD); + return true; + } + + return false; +} + +/// ModulePass for attaching synthetic debug info to everything, used with the +/// legacy module pass manager. +struct DebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + } + + DebugifyModulePass() : ModulePass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// FunctionPass for attaching synthetic debug info to instructions within a +/// single function, used with the legacy module pass manager. +struct DebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + "FunctionDebugify: "); + } + + DebugifyFunctionPass() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// ModulePass for checking debug info inserted by -debugify, used with the +/// legacy module pass manager. +struct CheckDebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, + "CheckModuleDebugify", Strip, StatsMap); + } + + CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. + +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +/// FunctionPass for checking debug info inserted by -debugify-function, used +/// with the legacy module pass manager. +struct CheckDebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + NameOfWrappedPass, "CheckFunctionDebugify", + Strip, StatsMap); + } + + CheckDebugifyFunctionPass(bool Strip = false, + StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. 
+ +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +} // end anonymous namespace + +ModulePass *createDebugifyModulePass() { return new DebugifyModulePass(); } + +FunctionPass *createDebugifyFunctionPass() { + return new DebugifyFunctionPass(); +} + +PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { + applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + return PreservedAnalyses::all(); +} + +ModulePass *createCheckDebugifyModulePass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap); +} + +FunctionPass *createCheckDebugifyFunctionPass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap); +} + +PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, + ModuleAnalysisManager &) { + checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false, + nullptr); + return PreservedAnalyses::all(); +} + +char DebugifyModulePass::ID = 0; +static RegisterPass<DebugifyModulePass> DM("debugify", + "Attach debug info to everything"); + +char CheckDebugifyModulePass::ID = 0; +static RegisterPass<CheckDebugifyModulePass> + CDM("check-debugify", "Check debug info from -debugify"); + +char DebugifyFunctionPass::ID = 0; +static RegisterPass<DebugifyFunctionPass> DF("debugify-function", + "Attach debug info to a function"); + +char CheckDebugifyFunctionPass::ID = 0; +static RegisterPass<CheckDebugifyFunctionPass> + CDF("check-debugify-function", "Check debug info from -debugify-function"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 57e2ff0251a9..651f776a4915 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 76b4635ad501..26d48ee0d23f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -12,13 +12,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FunctionImportUtils.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" using namespace llvm; /// Checks if we should import SGV as a definition, otherwise import as a /// declaration. bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) { + const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; // Only import the globals requested for importing. 
if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV))) @@ -31,16 +34,8 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition( return true; } -bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV) { - if (!isPerformingImport()) - return false; - return FunctionImportGlobalProcessing::doImportAsDefinition(SGV, - GlobalsToImport); -} - bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( - const GlobalValue *SGV) { + const GlobalValue *SGV, ValueInfo VI) { assert(SGV->hasLocalLinkage()); // Both the imported references and the original local variable must // be promoted. @@ -65,7 +60,7 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( // (so the source file name and resulting GUID are the same). Find the one // in this module. auto Summary = ImportIndex.findSummaryInModule( - SGV->getGUID(), SGV->getParent()->getModuleIdentifier()); + VI, SGV->getParent()->getModuleIdentifier()); assert(Summary && "Missing summary for global value when exporting"); auto Linkage = Summary->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { @@ -91,18 +86,15 @@ bool FunctionImportGlobalProcessing::isNonRenamableLocal( } #endif -std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, - bool DoPromote) { +std::string +FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); // For locals that must be promoted to global scope, ensure that // the promoted name uniquely identifies the copy in the original module, - // using the ID assigned during combined index creation. When importing, - // we rename all locals (not just those that are promoted) in order to - // avoid naming conflicts between locals imported from different modules. - if (SGV->hasLocalLinkage() && (DoPromote || isPerformingImport())) - return ModuleSummaryIndex::getGlobalNameForLocal( - SGV->getName(), - ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); - return SGV->getName(); + // using the ID assigned during combined index creation. + return ModuleSummaryIndex::getGlobalNameForLocal( + SGV->getName(), + ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); } GlobalValue::LinkageTypes @@ -229,6 +221,11 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } } + // We should always have a ValueInfo (i.e. GV in index) for definitions when + // we are exporting, and also when importing that value. + assert(VI || GV.isDeclaration() || + (isPerformingImport() && !doImportAsDefinition(&GV))); + // Mark read/write-only variables which can be imported with a specific // attribute. We can't internalize them now because IRMover will fail // to link variable definitions to their external declarations during @@ -238,27 +235,42 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { // If global value dead stripping is not enabled in summary then // propagateConstants hasn't been run. We can't internalize GV // in such a case. - if (!GV.isDeclaration() && VI && ImportIndex.withGlobalValueDeadStripping()) { - const auto &SL = VI.getSummaryList(); - auto *GVS = SL.empty() ?
nullptr : dyn_cast<GlobalVarSummary>(SL[0].get()); - // At this stage "maybe" is "definitely" - if (GVS && (GVS->maybeReadOnly() || GVS->maybeWriteOnly())) - cast<GlobalVariable>(&GV)->addAttribute("thinlto-internalize"); + if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) { + if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) { + // We can have more than one local with the same GUID, in the case of + // same-named locals in different but same-named source files that were + // compiled in their respective directories (so the source file name + // and resulting GUID is the same). Find the one in this module. + // Handle the case where there is no summary found in this module. That + // can happen in the distributed ThinLTO backend, because the index only + // contains summaries from the source modules if they are being imported. + // We might have a non-null VI and get here even in that case if the name + // matches one in this module (e.g. weak or appending linkage). + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier())); + if (GVS && + (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) { + V->addAttribute("thinlto-internalize"); + // Objects referenced by a writeonly GV's initializer should not be + // promoted, because there is no read access of any kind to them + // on behalf of this writeonly GV. To avoid promotion we convert + // the GV initializer to 'zeroinitializer'. This effectively drops + // references in the IR module (not in the combined index), so we can + // ignore them when computing the import. We do not export references + // of a writeonly object. See computeImportForReferencedGlobals(). + if (ImportIndex.isWriteOnly(GVS)) + V->setInitializer(Constant::getNullValue(V->getValueType())); + } + } } - bool DoPromote = false; - if (GV.hasLocalLinkage() && - ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { + if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) { // Save the original name string before we rename GV below. auto Name = GV.getName().str(); - // Once we change the name or linkage it is difficult to determine - // again whether we should promote since shouldPromoteLocalToGlobal needs - // to locate the summary (based on GUID from name and linkage). Therefore, - // use DoPromote result saved above. - GV.setName(getName(&GV, DoPromote)); - GV.setLinkage(getLinkage(&GV, DoPromote)); - if (!GV.hasLocalLinkage()) - GV.setVisibility(GlobalValue::HiddenVisibility); + GV.setName(getPromotedName(&GV)); + GV.setLinkage(getLinkage(&GV, /* DoPromote */ true)); + assert(!GV.hasLocalLinkage()); + GV.setVisibility(GlobalValue::HiddenVisibility); // If we are renaming a COMDAT leader, ensure that we record the COMDAT // for later renaming as well. This is required for COFF.
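A concrete illustration of the promotion scheme above (a sketch, not part of the patch): a promoted local keeps its original name plus a suffix derived from the hash of its source module, so same-named statics imported from different modules cannot collide. The exact "<name>.llvm.<module-hash>" suffix format is an assumption here; the real logic lives in ModuleSummaryIndex::getGlobalNameForLocal.

    #include <cstdint>
    #include <string>

    // Hypothetical stand-in for the promoted-name computation.
    static std::string promotedName(const std::string &Name, uint64_t ModHash) {
      // "static int counter;" in a.c and b.c promote to distinct globals,
      // e.g. "counter.llvm.1234" vs. "counter.llvm.5678".
      return Name + ".llvm." + std::to_string(ModHash);
    }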
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 34c32d9c0c98..4cfc9358499a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -10,13 +10,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<uint32_t> PredicatePassBranchWeight( "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20), @@ -24,7 +28,7 @@ static cl::opt<uint32_t> PredicatePassBranchWeight( "reciprocal of this value (default = 1 << 20)")); void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, - CallInst *Guard) { + CallInst *Guard, bool UseWC) { OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt)); SmallVector<Value *, 4> Args(std::next(Guard->arg_begin()), Guard->arg_end()); @@ -60,4 +64,63 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, DeoptCall->setCallingConv(Guard->getCallingConv()); DeoptBlockTerm->eraseFromParent(); + + if (UseWC) { + // We want the guard to be expressed as explicit control flow, but still be + // widenable. For that, we add a Widenable Condition intrinsic call to the + // guard's condition. + IRBuilder<> B(CheckBI); + auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, + {}, {}, nullptr, "widenable_cond"); + CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, + "explicit_guard_cond")); + assert(isWidenableBranch(CheckBI) && "sanity check"); + } +} + + +void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + // The temptingly trivial option is to produce something like this: + // br (and oldcond, newcond) where oldcond is assumed to contain a widenable + // condition, but that doesn't match the pattern parseWidenableBranch expects, + // so we have to be more sophisticated. + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ... form + IRBuilder<> B(WidenableBR); + C->set(B.CreateAnd(NewCond, C->get())); + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // The condition is only guaranteed to dominate the branch + WCAnd->moveBefore(WidenableBR); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenability"); +} + +void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ...
form + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // The condition is only guaranteed to dominate the branch + WCAnd->moveBefore(WidenableBR); + C->set(NewCond); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenability"); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp new file mode 100644 index 000000000000..9192e74b9ace --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -0,0 +1,186 @@ +//===- InjectTLIMappings.cpp - TLI to VFABI attribute injection ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Populates the VFABI attribute with the scalar-to-vector mappings +// from the TargetLibraryInfo. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/InjectTLIMappings.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "inject-tli-mappings" + +STATISTIC(NumCallInjected, + "Number of calls in which the mappings have been injected."); + +STATISTIC(NumVFDeclAdded, + "Number of function declarations that have been added."); +STATISTIC(NumCompUsedAdded, + "Number of `@llvm.compiler.used` operands that have been added."); + +/// Helper function to map the TLI name to a string that holds +/// the scalar-to-vector mapping. +/// +/// _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>) +/// +/// where: +/// +/// <isa> = "_LLVM_" +/// <mask> = "N". Note: TLI does not support masked interfaces. +/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor` +/// field of the `VecDesc` struct. +/// <vparams> = "v", one for each parameter of CI. +/// <scalarname> = the name of the scalar function called by CI. +/// <vectorname> = the name of the vector function mapped by the TLI. +static std::string mangleTLIName(StringRef VectorName, const CallInst &CI, + unsigned VF) { + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF; + for (unsigned I = 0; I < CI.getNumArgOperands(); ++I) + Out << "v"; + Out << "_" << CI.getCalledFunction()->getName() << "(" << VectorName << ")"; + return Out.str(); +} + +/// A helper function for converting scalar types to vector types. +/// If the incoming type is void, we return void. If the VF is 1, we return +/// the scalar type. +static Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, {VF, isScalable}); +} + +/// A helper function that adds the vector function declaration that +/// vectorizes the CallInst CI with a vectorization factor of VF +/// lanes. The TLI assumes that all parameters and the return type of +/// CI (other than void) need to be widened to a VectorType of VF +/// lanes. +static void addVariantDeclaration(CallInst &CI, const unsigned VF, + const StringRef VFName) { + Module *M = CI.getModule(); + + // Add function declaration.
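  // Worked example (derived from mangleTLIName above, not from the patch):
  // if the TLI maps the scalar "sin" to the vector function "__svml_sin2"
  // at VF = 2, the mangled mapping recorded for the call is
  //   _ZGV_LLVM_N2v_sin(__svml_sin2)
  // i.e. ISA "_LLVM_", unmasked "N", two lanes, and one "v" for the single
  // parameter of sin.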
+ Type *RetTy = ToVectorTy(CI.getType(), VF); + SmallVector<Type *, 4> Tys; + for (Value *ArgOperand : CI.arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + assert(!CI.getFunctionType()->isVarArg() && + "VarArg functions are not supported."); + FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false); + Function *VectorF = + Function::Create(FTy, Function::ExternalLinkage, VFName, M); + VectorF->copyAttributesFrom(CI.getCalledFunction()); + ++NumVFDeclAdded; + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName + << "` of type " << *(VectorF->getType()) << "\n"); + + // Make function declaration (without a body) "sticky" in the IR by + // listing it in the @llvm.compiler.used intrinsic. + assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` " + "only on declarations."); + appendToCompilerUsed(*M, {VectorF}); + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName + << "` to `@llvm.compiler.used`.\n"); + ++NumCompUsedAdded; +} + +static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) { + // This is needed to make sure we don't query the TLI for calls to + // bitcast of function pointers, like `%call = call i32 (i32*, ...) + // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`, + // as such calls make the `isFunctionVectorizable` raise an + // exception. + if (CI.isNoBuiltin() || !CI.getCalledFunction()) + return; + + const std::string ScalarName = CI.getCalledFunction()->getName(); + // Nothing to be done if the TLI thinks the function is not + // vectorizable. + if (!TLI.isFunctionVectorizable(ScalarName)) + return; + SmallVector<std::string, 8> Mappings; + VFABI::getVectorVariantNames(CI, Mappings); + Module *M = CI.getModule(); + const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(), + Mappings.end()); + // All VFs in the TLI are powers of 2. + for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF; + VF *= 2) { + const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF); + if (!TLIName.empty()) { + std::string MangledName = mangleTLIName(TLIName, CI, VF); + if (!OriginalSetOfMappings.count(MangledName)) { + Mappings.push_back(MangledName); + ++NumCallInjected; + } + Function *VariantF = M->getFunction(TLIName); + if (!VariantF) + addVariantDeclaration(CI, VF, TLIName); + } + } + + VFABI::setVectorVariantNames(&CI, Mappings); +} + +static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { + for (auto &I : instructions(F)) + if (auto CI = dyn_cast<CallInst>(&I)) + addMappingsFromTLI(TLI, *CI); + // Even if the pass adds IR attributes, the analyses are preserved. + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// New pass manager implementation. +//////////////////////////////////////////////////////////////////////////////// +PreservedAnalyses InjectTLIMappings::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F); + runImpl(TLI, F); + // Even if the pass adds IR attributes, the analyses are preserved. + return PreservedAnalyses::all(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy PM Implementation. 
+//////////////////////////////////////////////////////////////////////////////// +bool InjectTLIMappingsLegacy::runOnFunction(Function &F) { + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + return runImpl(TLI, F); +} + +void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<TargetLibraryInfoWrapperPass>(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy Pass manager initialization +//////////////////////////////////////////////////////////////////////////////// +char InjectTLIMappingsLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE, + "Inject TLI Mappings", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings", + false, false) + +FunctionPass *llvm::createInjectTLIMappingsLegacyPass() { + return new InjectTLIMappingsLegacy(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index a7f0f7ac5d61..6da612eb4e65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1254,7 +1254,8 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer // better alignment. - Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size); + Builder.CreateMemCpy(Dst, /*DstAlign*/ Align::None(), Src, + /*SrcAlign*/ Align::None(), Size); } /// When inlining a call site that has a byval argument, @@ -1293,16 +1294,16 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, } // Create the alloca. If we have DataLayout, use nice alignment. - unsigned Align = DL.getPrefTypeAlignment(AggTy); + Align Alignment(DL.getPrefTypeAlignment(AggTy)); // If the byval had an alignment specified, we *must* use at least that // alignment, as it is required by the byval argument (and uses of the // pointer inside the callee). - Align = std::max(Align, ByValAlignment); + Alignment = max(Alignment, MaybeAlign(ByValAlignment)); - Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(), - nullptr, Align, Arg->getName(), - &*Caller->begin()->begin()); + Value *NewAlloca = + new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment, + Arg->getName(), &*Caller->begin()->begin()); IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca)); // Uses of the argument in the function should use our new alloca @@ -1405,6 +1406,10 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // other. DenseMap<const MDNode *, MDNode *> IANodes; + // Check if we are not generating inline line tables and want to use + // the call site location instead. 
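  // A hedged sketch of how this is driven (not part of this patch): a front
  // end opts a function in by attaching the string attribute, e.g.
  //   F->addFnAttr("no-inline-line-tables");
  // so that code inlined for such functions keeps the call-site location
  // instead of growing an inlined-at chain.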
+ bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables"); + for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { @@ -1416,20 +1421,22 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, BI->setMetadata(LLVMContext::MD_loop, NewLoopID); } - if (DebugLoc DL = BI->getDebugLoc()) { - DebugLoc IDL = - inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes); - BI->setDebugLoc(IDL); - continue; - } + if (!NoInlineLineTables) + if (DebugLoc DL = BI->getDebugLoc()) { + DebugLoc IDL = + inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes); + BI->setDebugLoc(IDL); + continue; + } - if (CalleeHasDebugInfo) + if (CalleeHasDebugInfo && !NoInlineLineTables) continue; - // If the inlined instruction has no line number, make it look as if it - // originates from the call location. This is important for - // ((__always_inline__, __nodebug__)) functions which must use caller - // location for all instructions in their function body. + // If the inlined instruction has no line number, or if inline info + // is not being generated, make it look as if it originates from the call + // location. This is important for ((__always_inline__, __nodebug__)) + // functions which must use caller location for all instructions in their + // function body. // Don't update static allocas, as they may get moved later. if (auto *AI = dyn_cast<AllocaInst>(BI)) @@ -1438,6 +1445,19 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, BI->setDebugLoc(TheCallDL); } + + // Remove debug info intrinsics if we're not keeping inline info. + if (NoInlineLineTables) { + BasicBlock::iterator BI = FI->begin(); + while (BI != FI->end()) { + if (isa<DbgInfoIntrinsic>(BI)) { + BI = BI->eraseFromParent(); + continue; + } + ++BI; + } + } + } } @@ -1453,7 +1473,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, BlockFrequencyInfo *CalleeBFI, const BasicBlock &CalleeEntryBlock) { SmallPtrSet<BasicBlock *, 16> ClonedBBs; - for (auto const &Entry : VMap) { + for (auto Entry : VMap) { if (!isa<BasicBlock>(Entry.first) || !Entry.second) continue; auto *OrigBB = cast<BasicBlock>(Entry.first); @@ -1508,22 +1528,25 @@ void llvm::updateProfileCallee( else newEntryCount = priorEntryCount + entryDelta; - Callee->setEntryCount(newEntryCount); - // During inlining ? if (VMap) { uint64_t cloneEntryCount = priorEntryCount - newEntryCount; - for (auto const &Entry : *VMap) + for (auto Entry : *VMap) if (isa<CallInst>(Entry.first)) if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) CI->updateProfWeight(cloneEntryCount, priorEntryCount); } - for (BasicBlock &BB : *Callee) - // No need to update the callsite if it is pruned during inlining. - if (!VMap || VMap->count(&BB)) - for (Instruction &I : BB) - if (CallInst *CI = dyn_cast<CallInst>(&I)) - CI->updateProfWeight(newEntryCount, priorEntryCount); + + if (entryDelta) { + Callee->setEntryCount(newEntryCount); + + for (BasicBlock &BB : *Callee) + // No need to update the callsite if it is pruned during inlining. + if (!VMap || VMap->count(&BB)) + for (Instruction &I : BB) + if (CallInst *CI = dyn_cast<CallInst>(&I)) + CI->updateProfWeight(newEntryCount, priorEntryCount); + } } /// This function inlines the called function into the basic block of the @@ -1842,6 +1865,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once.
while (isa<AllocaInst>(I) && + !cast<AllocaInst>(I)->use_empty() && allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) { IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp index 6c4fc1ceb991..aac0b55801c4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp @@ -15,6 +15,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp index 29e7c5260f46..5746d69260d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -43,7 +43,9 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -74,7 +76,8 @@ static bool isExitBlock(BasicBlock *BB, /// that are outside the current loop. If so, insert LCSSA PHI nodes and /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, + ScalarEvolution *SE) { SmallVector<Use *, 16> UsesToRewrite; SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; @@ -134,6 +137,11 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); + // Force re-computation of I, as some users now need to use the new PHI + // node. + if (SE) + SE->forgetValue(I); + // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. for (BasicBlock *ExitBB : ExitBlocks) { @@ -192,9 +200,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); UseToRewrite->set(&UserBB->front()); continue; } @@ -202,10 +207,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // If we added a single PHI, it must dominate all uses and we can directly // rename it. if (AddedPHIs.size() == 1) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - // We might call ValueIsRAUWd multiple times for the same value. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, AddedPHIs[0]); UseToRewrite->set(AddedPHIs[0]); continue; } @@ -368,7 +369,7 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, Worklist.push_back(&I); } } - Changed = formLCSSAForInstructions(Worklist, DT, *LI); + Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE); // If we modified the code, remove any caches about the loop from SCEV to // avoid dangling entries. 
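A hedged usage sketch of the updated utility (the wrapper name is illustrative; passing a null ScalarEvolution remains valid and simply skips the invalidation):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/LoopUtils.h"

    // Rewrite out-of-loop uses of the worklist instructions into LCSSA PHIs.
    // With SE non-null, each rewritten instruction is forgotten by SCEV, so
    // no stale cached expression survives the rewrite.
    static bool rewriteToLCSSA(llvm::SmallVectorImpl<llvm::Instruction *> &Worklist,
                               llvm::DominatorTree &DT, llvm::LoopInfo &LI,
                               llvm::ScalarEvolution *SE) {
      return llvm::formLCSSAForInstructions(Worklist, DT, LI, SE);
    }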
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index ed28fffc22b5..4c52fac6f7cb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 5bcd05757ec1..b2d511c7c9a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -1421,22 +1421,32 @@ bool llvm::LowerDbgDeclare(Function &F) { })) continue; - for (auto &AIUse : AI->uses()) { - User *U = AIUse.getUser(); - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (AIUse.getOperandNo() == 1) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - } else if (CallInst *CI = dyn_cast<CallInst>(U)) { - // This is a call by-value or some other instruction that takes a - // pointer to the variable. Insert a *value* intrinsic that describes - // the variable by dereferencing the alloca. - DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); - auto *DerefExpr = - DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); - DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, NewLoc, - CI); + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AI); + while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (auto &AIUse : V->uses()) { + User *U = AIUse.getUser(); + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (AIUse.getOperandNo() == 1) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } else if (CallInst *CI = dyn_cast<CallInst>(U)) { + // This is a call by-value or some other instruction that takes a + // pointer to the variable. Insert a *value* intrinsic that describes + // the variable by dereferencing the alloca. + if (!CI->isLifetimeStartOrEnd()) { + DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); + auto *DerefExpr = + DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); + DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, + NewLoc, CI); + } + } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) { + if (BI->getType()->isPointerTy()) + WorkList.push_back(BI); + } } } DDI->eraseFromParent(); @@ -1611,6 +1621,11 @@ bool llvm::salvageDebugInfo(Instruction &I) { return salvageDebugInfoForDbgValues(I, DbgUsers); } +void llvm::salvageDebugInfoOrMarkUndef(Instruction &I) { + if (!salvageDebugInfo(I)) + replaceDbgUsesWithUndef(&I); +} + bool llvm::salvageDebugInfoForDbgValues( Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) { auto &Ctx = I.getContext(); @@ -1661,9 +1676,8 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, }; // initializer-list helper for applying operators to the source DIExpression. 
- auto applyOps = - [&](std::initializer_list<uint64_t> Opcodes) -> DIExpression * { - SmallVector<uint64_t, 8> Ops(Opcodes); + auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * { + SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end()); return doSalvage(Ops); }; @@ -1671,8 +1685,21 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, // No-op casts and zexts are irrelevant for debug info. if (CI->isNoopCast(DL) || isa<ZExtInst>(&I)) return SrcDIExpr; - return nullptr; - } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + + Type *Type = CI->getType(); + // Casts other than Trunc or SExt to scalar types cannot be salvaged. + if (Type->isVectorTy() || (!isa<TruncInst>(&I) && !isa<SExtInst>(&I))) + return nullptr; + + Value *FromValue = CI->getOperand(0); + unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits(); + unsigned ToTypeBitSize = Type->getScalarSizeInBits(); + + return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize, + isa<SExtInst>(&I))); + } + + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { unsigned BitWidth = M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a constant GEP into a DIExpression. @@ -1727,7 +1754,7 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, using DbgValReplacement = Optional<DIExpression *>; /// Point debug users of \p From to \p To using exprs given by \p RewriteExpr, -/// possibly moving/deleting users to prevent use-before-def. Returns true if +/// possibly moving/undefing users to prevent use-before-def. Returns true if /// changes are made. static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, @@ -1740,7 +1767,7 @@ static bool rewriteDebugUsers( // Prevent use-before-def of To. bool Changed = false; - SmallPtrSet<DbgVariableIntrinsic *, 1> DeleteOrSalvage; + SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage; if (isa<Instruction>(&To)) { bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint; @@ -1755,14 +1782,14 @@ static bool rewriteDebugUsers( // Users which otherwise aren't dominated by the replacement value must // be salvaged or deleted. } else if (!DT.dominates(&DomPoint, DII)) { - DeleteOrSalvage.insert(DII); + UndefOrSalvage.insert(DII); } } } // Update debug users without use-before-def risk. for (auto *DII : Users) { - if (DeleteOrSalvage.count(DII)) + if (UndefOrSalvage.count(DII)) continue; LLVMContext &Ctx = DII->getContext(); @@ -1776,18 +1803,10 @@ static bool rewriteDebugUsers( Changed = true; } - if (!DeleteOrSalvage.empty()) { + if (!UndefOrSalvage.empty()) { // Try to salvage the remaining debug users. - Changed |= salvageDebugInfo(From); - - // Delete the debug users which weren't salvaged. - for (auto *DII : DeleteOrSalvage) { - if (DII->getVariableLocation() == &From) { - LLVM_DEBUG(dbgs() << "Erased UseBeforeDef: " << *DII << '\n'); - DII->eraseFromParent(); - Changed = true; - } - } + salvageDebugInfoOrMarkUndef(From); + Changed = true; } return Changed; @@ -1862,10 +1881,8 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, return None; bool Signed = *Signedness == DIBasicType::Signedness::Signed; - dwarf::TypeKind TK = Signed ? 
dwarf::DW_ATE_signed : dwarf::DW_ATE_unsigned; - SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_LLVM_convert, ToBits, TK, - dwarf::DW_OP_LLVM_convert, FromBits, TK}); - return DIExpression::appendToStack(DII.getExpression(), Ops); + return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits, + Signed); }; return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt); } @@ -2574,7 +2591,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, if (!NewTy->isPointerTy()) return; - unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy); + unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy); if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { MDNode *NN = MDNode::get(OldLI.getContext(), None); NewLI.setMetadata(LLVMContext::MD_nonnull, NN); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 889ea5ca9970..c065e0269c64 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -76,6 +76,13 @@ private: }; } // end anonymous namespace +/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not +/// previously exist in the map, and the value was inserted. +static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) { + bool Inserted = VM.insert({K, V}).second; + assert(Inserted); + (void)Inserted; +} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by /// these instruction that were outside of the loop, we have to insert PHI nodes @@ -300,7 +307,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For PHI nodes, the value available in OldPreHeader is just the // incoming value from OldPreHeader. for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. @@ -358,13 +366,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. - ValueMap[Inst] = V; + InsertNewValueIntoMap(ValueMap, Inst, V); if (!C->mayHaveSideEffects()) { C->deleteValue(); C = nullptr; } } else { - ValueMap[Inst] = C; + InsertNewValueIntoMap(ValueMap, Inst, C); } if (C) { // Otherwise, stick the new instruction into the new block! @@ -376,7 +384,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { AC->registerAssumption(II); // MemorySSA cares whether the cloned instruction was inserted or not, and // not whether it can be remapped to a simplified value. - ValueMapMSSA[Inst] = C; + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); } } @@ -396,7 +405,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Update MemorySSA before the rewrite call below changes the 1:1 // instruction:cloned_instruction_or_value mapping. 
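  // Aside (not part of the patch): InsertNewValueIntoMap above is the usual
  // insert-then-assert idiom; it asserts key uniqueness without paying for a
  // second lookup, and the (void)Inserted cast keeps release builds, where
  // assert() compiles away, free of unused-variable warnings.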
if (MSSAU) { - ValueMapMSSA[OrigHeader] = OrigPreheader; + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, ValueMapMSSA); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp index d0f89dc54bfb..28f88f39a712 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp index a7590fc32545..4b94b371e70a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -22,17 +22,18 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index bf2e87b0d49f..f1965934b2d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" @@ -177,6 +176,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); + assert(Header && "No header."); assert(L->getSubLoops().size() == 1); Loop *SubLoop = *L->begin(); @@ -247,8 +247,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop( BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *LatchBlock = L->getLoopLatch(); + assert(Preheader && "No preheader"); + assert(LatchBlock && "No latch block"); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); - assert(Preheader && LatchBlock && Header); assert(BI && !BI->isUnconditional()); bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp index 58e42074f963..7a168ff6f32b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp @@ -212,14 +212,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV); // Avoid huge SCEV computations in the loop below, make sure we only - // consider AddRecs of the loop we are trying to peel and avoid - // non-monotonic predicates, as we will not be able to simplify the loop - // body. - // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can - // simplify the loop, if we peel 1 additional iteration, if there - // is no wrapping. + // consider AddRecs of the loop we are trying to peel. + if (!LeftAR->isAffine() || LeftAR->getLoop() != &L) + continue; bool Increasing; - if (!LeftAR->isAffine() || LeftAR->getLoop() != &L || + if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) && !SE.isMonotonicPredicate(LeftAR, Pred, Increasing)) continue; (void)Increasing; @@ -238,18 +235,43 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, Pred = ICmpInst::getInversePredicate(Pred); const SCEV *Step = LeftAR->getStepRecurrence(SE); - while (NewPeelCount < MaxPeelCount && - SE.isKnownPredicate(Pred, IterVal, RightSCEV)) { - IterVal = SE.getAddExpr(IterVal, Step); + const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step); + auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step, + &NewPeelCount]() { + IterVal = NextIterVal; + NextIterVal = SE.getAddExpr(IterVal, Step); NewPeelCount++; + }; + + auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() { + return NewPeelCount < MaxPeelCount; + }; + + while (CanPeelOneMoreIteration() && + SE.isKnownPredicate(Pred, IterVal, RightSCEV)) + PeelOneMoreIteration(); + + // With *that* peel count, does the predicate !Pred become known in the + // first iteration of the loop body after peeling? + if (!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal, + RightSCEV)) + continue; // If not, give up. + + // However, for equality comparisons, that isn't always sufficient to + // eliminate the comparison in the loop body; we may need to peel one more + // iteration. See if that makes !Pred become unknown again. + if (ICmpInst::isEquality(Pred) && + !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal, + RightSCEV)) { + assert(!SE.isKnownPredicate(Pred, IterVal, RightSCEV) && + SE.isKnownPredicate(Pred, NextIterVal, RightSCEV) && + "Expected Pred to go from known to unknown."); + if (!CanPeelOneMoreIteration()) + continue; // Need to peel one more iteration, but can't. Give up. + PeelOneMoreIteration(); // Great! } - // Only peel the loop if the monotonic predicate !Pred becomes known in the - // first iteration of the loop body after peeling. - if (NewPeelCount > DesiredPeelCount && - SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal, - RightSCEV)) - DesiredPeelCount = NewPeelCount; + DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount); } return DesiredPeelCount; @@ -562,7 +584,7 @@ static void cloneLoopBlocks( // LastValueMap is updated with the values for the current loop // which are used the next time this function is called.
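To make the equality-peeling rule above concrete, a source-level sketch (illustrative only, not from the patch): peeling the first iteration lets the i == 0 test fold away in the remaining loop body, and for equality predicates the pass may peel one extra iteration so that the inverted predicate is known from the first non-peeled iteration onward.

    // Before peeling, every iteration evaluates the equality compare.
    void example(int *A, int N) {
      for (int i = 0; i < N; ++i) {
        if (i == 0)
          A[i] = 0;            // true only in the very first iteration
        else
          A[i] = A[i - 1] + 1;
      }
    }
    // After peeling one iteration, the remaining loop runs with i >= 1, so
    // the compare and its dead arm can be deleted from the loop body.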
- for (const auto &KV : VMap) + for (auto KV : VMap) LVMap[KV.first] = KV.second; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index d22fdb4d52dc..ddb7479924bd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" @@ -395,9 +396,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, } } if (CreateRemainderLoop) { - Loop *NewLoop = NewLoops[L]; - MDNode *LoopID = NewLoop->getLoopID(); + Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + MDNode *LoopID = NewLoop->getLoopID(); // Only add loop metadata if the loop is not going to be completely // unrolled. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index b4d7f35d2d9a..c4c40189fda4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -35,6 +34,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -672,7 +672,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, LI->removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. - LI->erase(L); + // Note: LoopInfo::erase removes the given loop and relinks its subloops with + // its parent, while removeLoop/removeChildLoop remove the given loop but + // do not relink its subloops, which is what we want. + if (Loop *ParentLoop = L->getParentLoop()) { + Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L); + assert(I != ParentLoop->end() && "Couldn't find loop"); + ParentLoop->removeChildLoop(I); + } else { + Loop::iterator I = find(LI->begin(), LI->end(), L); + assert(I != LI->end() && "Couldn't find loop"); + LI->removeLoop(I); + } + LI->destroy(L); } } @@ -702,19 +714,19 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. - uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackedgeTakenWeight, LatchExitWeight; + if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!BackedgeTakenWeight || !LatchExitWeight) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest.
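  // Worked example (illustrative): branch_weights of 1000 on the backedge
  // and 8 on the latch exit give divideNearest(1000, 8) = (1000 + 4) / 8
  // = 125, i.e. an estimated 125 body executions per loop entry.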
- if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp index 5d7759056c7d..50752bd78a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp index fe67e191dc62..1af0ce3d86cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 8256e3b5f5af..4b9d0dadfc17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp index cd2c81b6abc8..5ad7aeb463ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Utils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 60bb2775a194..7f961dbaf4b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/TypeFinder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp index 26d3402bd279..a16ca1fb8efa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/BranchProbability.h" +#include 
"llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormatVariadic.h" #include <cstdint> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp index 1ef3757017a8..b94f57e4dc2c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -280,3 +281,31 @@ std::string llvm::getUniqueModuleId(Module *M) { MD5::stringifyResult(R, Str); return ("$" + Str).str(); } + +void VFABI::setVectorVariantNames( + CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) { + if (VariantMappings.empty()) + return; + + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + for (const std::string &VariantMapping : VariantMappings) + Out << VariantMapping << ","; + // Get rid of the trailing ','. + assert(!Buffer.str().empty() && "Must have at least one char."); + Buffer.pop_back(); + + Module *M = CI->getModule(); +#ifndef NDEBUG + for (const std::string &VariantMapping : VariantMappings) { + Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping); + assert(VI.hasValue() && "Canno add an invalid VFABI name."); + assert(M->getNamedValue(VI.getValue().VectorName) && + "Cannot add variant to attribute: " + "vector function declaration is missing."); + } +#endif + CI->addAttribute( + AttributeList::FunctionIndex, + Attribute::get(M->getContext(), MappingsAttrName, Buffer.str())); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp index ac8991e9d475..1c5c41abc682 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/NameAnonGlobals.h" - #include "llvm/ADT/SmallString.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/MD5.h" #include "llvm/Transforms/Utils/ModuleUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 44859eafb9c1..dda2867f44b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 0530d3a987a5..0ea6e99e6f19 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GuardUtils.h" 
#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -1403,10 +1404,16 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (!SI) + if (!SI) { + // Propagate fast-math-flags from phi node to its replacement select. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN.getFastMathFlags()); + SI = cast<SelectInst>( Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName() + "." + BB2V->getName(), BI)); + } // Make the PHI node use the select for all incoming values for BB1/BB2 for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) @@ -2261,14 +2268,14 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL, if (!BBI->use_empty()) TranslateMap[&*BBI] = N; } - // Insert the new instruction into its new home. - if (N) + if (N) { + // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); - // Register the new instruction with the assumption cache if necessary. - if (auto *II = dyn_cast_or_null<IntrinsicInst>(N)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + // Register the new instruction with the assumption cache if necessary. + if (AC && match(N, m_Intrinsic<Intrinsic::assume>())) + AC->registerAssumption(cast<IntrinsicInst>(N)); + } } // Loop over all of the edges from PredBB to BB, changing them to branch @@ -2417,7 +2424,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (IfBlock2) hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2); + // Propagate fast-math-flags from phi nodes to replacement selects. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN->getFastMathFlags()); + // Change the PHI node into a select instruction. Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); @@ -3211,6 +3223,47 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, return Changed; } + +/// If the previous block ended with a widenable branch, determine if reusing +/// the target block is profitable and legal. This will have the effect of +/// "widening" PBI, but doesn't require us to reason about hosting safety. +static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { + // TODO: This can be generalized in two important ways: + // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input + // values from the PBI edge. + // 2) We can sink side effecting instructions into BI's fallthrough + // successor provided they doesn't contribute to computation of + // BI's condition. + Value *CondWB, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) || + IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor()) + return false; + if (!IfFalseBB->phis().empty()) + return false; // TODO + // Use lambda to lazily compute expensive condition after cheap ones. 
+ auto NoSideEffects = [](BasicBlock &BB) { + return !llvm::any_of(BB, [](const Instruction &I) { + return I.mayWriteToMemory() || I.mayHaveSideEffects(); + }); + }; + if (BI->getSuccessor(1) != IfFalseBB && // no inf looping + BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(1)->removePredecessor(BI->getParent()); + BI->setSuccessor(1, IfFalseBB); + return true; + } + if (BI->getSuccessor(0) != IfFalseBB && // no inf looping + BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(0)->removePredecessor(BI->getParent()); + BI->setSuccessor(0, IfFalseBB); + return true; + } + return false; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the @@ -3266,6 +3319,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, } } + // If the previous block ended with a widenable branch, determine if reusing + // the target block is profitable and legal. This will have the effect of + // "widening" PBI, but doesn't require us to reason about hoisting safety. + if (tryWidenCondBranchToCondBranch(PBI, BI)) + return true; + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) if (CE->canTrap()) return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 0324993a8203..fa3a9d21f3df 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -288,8 +288,9 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); + B.CreateMemCpy( + CpyDst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); return Dst; } @@ -364,8 +365,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI), - "strchr"); + if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI)) + return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr"); return nullptr; } @@ -561,7 +562,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. CallInst *NewCI = - B.CreateMemCpy(Dst, 1, Src, 1, + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -589,7 +590,8 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI = B.CreateMemCpy(Dst, 1, Src, 1, LenV); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), LenV); NewCI->setAttributes(CI->getAttributes()); return DstEnd; } @@ -624,7 +626,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { if (SrcLen == 0) { // strncpy(x, "", y) -> memset(align 1 x, '\0', y) - CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, 1); + CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align::None()); AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); @@ -637,7 +639,8 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] - CallInst *NewCI = B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len)); + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(PT), Len)); NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -1113,17 +1116,58 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { return nullptr; // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) - CallInst *NewCI = - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, Size); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } +Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3)); + StringRef SrcStr; + if (CI->use_empty() && Dst == Src) + return Dst; + // memccpy(d, s, c, 0) -> nullptr + if (N) { + if (N->isNullValue()) + return Constant::getNullValue(CI->getType()); + if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0, + /*TrimAtNul=*/false) || + !StopChar) + return nullptr; + } else { + return nullptr; + } + + // Wrap arg 'c' of type int to char + size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); + if (Pos == StringRef::npos) { + if (N->getZExtValue() <= SrcStr.size()) { + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + CI->getArgOperand(3)); + return Constant::getNullValue(CI->getType()); + } + return nullptr; + } + + Value *NewN = + ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); + // memccpy -> llvm.memcpy + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), NewN); + return Pos + 1 <= N->getZExtValue() + ? 
B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) + : Constant::getNullValue(CI->getType()); +} + Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { Value *Dst = CI->getArgOperand(0); Value *N = CI->getArgOperand(2); // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n - CallInst *NewCI = B.CreateMemCpy(Dst, 1, CI->getArgOperand(1), 1, N); + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), CI->getArgOperand(1), + Align::None(), N); NewCI->setAttributes(CI->getAttributes()); return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); } @@ -1135,8 +1179,8 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { return nullptr; // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) - CallInst *NewCI = - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, Size); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1196,7 +1240,8 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, 1); + CallInst *NewCI = + B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align::None()); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1599,6 +1644,11 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) { (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5))) return nullptr; + // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step, + // so that requires fast-math-flags (afn or reassoc). + if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc())) + return nullptr; + Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI); if (!Sqrt) return nullptr; @@ -1696,7 +1746,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // TODO: This whole transformation should be backend specific (e.g. some // backends might prefer libcalls or the limit for the exponent might // be different) and it should also consider optimizing for size. - APFloat LimF(ExpoF->getSemantics(), 33.0), + APFloat LimF(ExpoF->getSemantics(), 33), ExpoA(abs(*ExpoF)); if (ExpoA.compare(LimF) == APFloat::cmpLessThan) { // This transformation applies to integer or integer+0.5 exponents only. @@ -2426,9 +2476,11 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + B.CreateMemCpy( + CI->getArgOperand(0), Align::None(), CI->getArgOperand(1), + Align::None(), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1)); // Copy the null byte. 
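(Aside on the new memccpy fold above: it models the usual POSIX semantics, which a small host program can demonstrate. The sketch below is illustrative only, is not part of the patch, and assumes memccpy is available from <cstring> as on POSIX systems.)

#include <cstdio>
#include <cstring>

int main() {
  char Dst[8] = {0};
  // Stop char 'l' first occurs at offset 2 of "hello", so three bytes
  // ("hel") are copied and the result points one past the stop char,
  // matching the fold's GEP of Dst by Pos + 1.
  void *End = memccpy(Dst, "hello", 'l', 5);
  std::printf("%s %td\n", Dst, static_cast<char *>(End) - Dst); // hel 3
  // No 'x' among the first two bytes: both bytes are copied and the miss
  // is reported with a null result, matching the fold's null return.
  return memccpy(Dst, "hi", 'x', 2) == nullptr ? 0 : 1;
}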
return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2463,7 +2515,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen); + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), IncLen); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -2534,7 +2587,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) B.CreateMemCpy( - CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, + CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1)); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -2575,7 +2629,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1, + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(3), + Align::None(), ConstantInt::get(CI->getType(), Str.size() + 1)); // The snprintf result is the unincremented number of bytes in the string. @@ -2716,7 +2771,8 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) return nullptr; @@ -2792,7 +2848,8 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) - return B.CreateMemMove(CI->getArgOperand(1), 1, CI->getArgOperand(0), 1, + return B.CreateMemMove(CI->getArgOperand(1), Align::None(), + CI->getArgOperand(0), Align::None(), CI->getArgOperand(2)); } @@ -2864,6 +2921,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeMemCmp(CI, Builder); case LibFunc_memcpy: return optimizeMemCpy(CI, Builder); + case LibFunc_memccpy: + return optimizeMemCCpy(CI, Builder); case LibFunc_mempcpy: return optimizeMemPCpy(CI, Builder); case LibFunc_memmove: @@ -3223,8 +3282,9 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - CallInst *NewCI = B.CreateMemCpy( - CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -3234,8 +3294,9 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - CallInst *NewCI = B.CreateMemMove( - CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, CI->getArgOperand(2)); + 
CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -3248,8 +3309,8 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, if (isFortifiedCallFoldable(CI, 3, 2)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - CallInst *NewCI = - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, + CI->getArgOperand(2), Align::None()); NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp index 1519751197d2..d2a400027d4b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -10,28 +10,80 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SizeOpts.h" + using namespace llvm; -static cl::opt<bool> ProfileGuidedSizeOpt( +cl::opt<bool> EnablePGSO( "pgso", cl::Hidden, cl::init(true), - cl::desc("Enable the profile guided size optimization. ")); - -bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(F); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI); + cl::desc("Enable the profile guided size optimizations. ")); + +cl::opt<bool> PGSOLargeWorkingSetSizeOnly( + "pgso-lwss-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "if the working set size is large (except for cold code.)")); + +cl::opt<bool> PGSOColdCodeOnly( + "pgso-cold-code-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "to cold code.")); + +cl::opt<bool> PGSOIRPassOrTestOnly( + "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only" + "to the IR passes or tests.")); + +cl::opt<bool> ForcePGSO( + "force-pgso", cl::Hidden, cl::init(false), + cl::desc("Force the (profiled-guided) size optimizations. 
")); + +cl::opt<int> PgsoCutoffInstrProf( + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for instrumentation profile.")); + +cl::opt<int> PgsoCutoffSampleProf( + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for sample profile.")); + +namespace { +struct BasicBlockBFIAdapter { + static bool isFunctionColdInCallGraph(const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionColdInCallGraph(F, BFI); + } + static bool isFunctionHotInCallGraphNthPercentile(int CutOff, + const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI); + } + static bool isColdBlock(const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isColdBlock(BB, BFI); + } + static bool isHotBlockNthPercentile(int CutOff, + const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isHotBlockNthPercentile(CutOff, BB, BFI); + } +}; +} // end anonymous namespace + +bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI, + QueryType); } -bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(BB); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI); +bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI, + QueryType); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp index 50844cf9d1c5..7880ea1c6c47 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp index 97a4533fabe5..21cbbfb140b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/DebugInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index 5d380dcf231c..aacf81d83519 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,6 +69,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" +#include 
"llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7f7bdf8a3d6d..9af39d9a0dd1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -18,10 +18,16 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils.h" using namespace llvm; char UnifyFunctionExitNodes::ID = 0; + +UnifyFunctionExitNodes::UnifyFunctionExitNodes() : FunctionPass(ID) { + initializeUnifyFunctionExitNodesPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn", "Unify function exit nodes", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp index 5272ab6e95d5..7769c7493cda 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp @@ -39,6 +39,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeMetaRenamerPass(Registry); initializeStripGCRelocatesPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); + initializeInjectTLIMappingsLegacyPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index f44976c723ec..7478daa2a0a5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -38,6 +38,7 @@ // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. 
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" @@ -52,7 +53,6 @@ #include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -71,14 +71,15 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" -#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include <algorithm> #include <cassert> #include <cstdlib> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index f43842be5357..3f943f4c0688 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -72,7 +72,7 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), IsVectorized("isvectorized", 0, HK_ISVECTORIZED), - Predicate("vectorize.predicate.enable", 0, HK_PREDICATE), TheLoop(L), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -815,6 +815,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } } + // For first order recurrences, we use the previous value (incoming value from + // the latch) to check if it dominates all users of the recurrence. Bail out + // if we have to sink such an instruction for another recurrence, as the + // dominance requirement may not hold after sinking. + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { + Instruction *V = + cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch)); + return SinkAfter.find(V) != SinkAfter.end(); + })) + return false; + // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a5e85f27fabf..c3ca43fcd492 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -201,6 +201,9 @@ class LoopVectorizationPlanner { /// The profitability analysis. LoopVectorizationCostModel &CM; + /// The interleaved access analysis. + InterleavedAccessInfo &IAI; + SmallVector<VPlanPtr, 4> VPlans; /// This class is used to enable the VPlan to invoke a method of ILV. 
This is @@ -211,6 +214,8 @@ class LoopVectorizationPlanner { VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} Value *getOrCreateVectorValues(Value *V, unsigned Part) override; + Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) override; }; /// A builder used to construct the current plan. @@ -223,8 +228,10 @@ public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} + LoopVectorizationCostModel &CM, + InterleavedAccessInfo &IAI) + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8f0bf70f873c..684a3098e564 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,8 +58,8 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanHCFGTransforms.h" #include "VPlanPredicator.h" +#include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -124,6 +124,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -149,7 +150,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; @@ -200,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -/// We don't interleave loops with a known constant trip count below this -/// number. -static const unsigned TinyTripCountInterleaveThreshold = 128; +static cl::opt<unsigned> TinyTripCountInterleaveThreshold( + "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, + cl::desc("We don't interleave loops with an estimated constant trip count " + "below this number")); static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -427,6 +428,11 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -476,15 +482,20 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. 
- void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); - - /// Vectorize Load and Store instructions, optionally masking the vector - /// operations if \p BlockInMask is non-null. - void vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to + /// with the base address given in \p Addr, optionally masking the vector + /// operations if \p BlockInMask is non-null. Use \p State to translate given + /// VPValues to IR values in the vectorized loop. + void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask = nullptr); + + /// Vectorize Load and Store instructions with the base address given in \p + /// Addr, optionally masking the vector operations if \p BlockInMask is + /// non-null. Use \p State to translate given VPValues to IR values in the + /// vectorized loop. + void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -525,6 +536,9 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Clear NSW/NUW flags from reduction instructions if necessary. + void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -539,10 +553,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -1204,14 +1214,14 @@ public: /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. - bool isLegalMaskedScatter(Type *DataType) { - return TTI.isLegalMaskedScatter(DataType); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. 
- bool isLegalMaskedGather(Type *DataType) { - return TTI.isLegalMaskedGather(DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedGather(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather @@ -1222,7 +1232,9 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + MaybeAlign Align = getLoadStoreAlignment(V); + return (LI && isLegalMaskedGather(Ty, Align)) || + (SI && isLegalMaskedScatter(Ty, Align)); } /// Returns true if \p I is an instruction that will be scalarized with @@ -2155,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { const InterleaveGroup<Instruction> *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -2165,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. - setDebugLocFromInst(Builder, Ptr); - SmallVector<Value *, 2> NewPtrs; + SmallVector<Value *, 2> AddrParts; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskForCondRequired = BlockInMask; - if (IsMaskForCondRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } + // TODO: extend the masked interleaved-group support to reversed access. + assert((!BlockInMask || !Group->isReverse()) && + "Reversed masked interleave-group not supported."); // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, @@ -2196,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *AddrPart = State.get(Addr, {Part, 0}); + setDebugLocFromInst(Builder, AddrPart); // Notice current instruction could be any index. Need to adjust the address // to the member of index 0. @@ -2214,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // Current pointer is pointed to A[i+2], adjust it to A[i]. 
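// Illustrative note (not from the patch): with an interleave factor of 3 and
// the current member at index 2, Index == 2 here, so the GEP below steps the
// per-part address back by two elements, turning &A[i+2] into &A[i]; for a
// reversed group the earlier `Index += (VF - 1) * Group->getFactor()`
// adjustment has already moved the reference point to the last vector lane.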
- NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); - if (InBounds) - cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. - NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); + Type *PtrTy = VecTy->getPointerTo(AddressSpace); + AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } setDebugLocFromInst(Builder, Instr); @@ -2237,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskForCondRequired || MaskForGaps) { + if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = MaskForGaps; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), GroupMask, UndefVec, "wide.masked.vec"); } else - NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); @@ -2325,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, "interleaved.vec"); Instruction *NewStoreInstr; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { // Attempt to issue a wide load. 
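// Illustrative note (not from the patch): Instr still identifies the scalar
// memory operation, but after this change its address and mask arrive as
// VPValues and are materialized per unroll part via State.get(Addr, Part)
// and State.get(BlockInMask, Part), rather than being recomputed from the
// instruction's pointer operand; the legality queries above likewise now
// thread the access alignment through to TTI.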
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2354,17 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point"); if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr); + return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getLoadStorePointerOperand(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); const Align Alignment = DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); - unsigned AddressSpace = getLoadStoreAddressSpace(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2378,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"); + (void)ConsecutiveStride; - // Handle consecutive loads/stores. - if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); - - VectorParts Mask; + VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; if (isMaskRequired) - Mask = *BlockInMask; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>( - getLoadStorePointerOperand(Instr)->stripPointerCasts())) - InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; ++Part) + BlockInMaskParts[Part] = State.get(BlockInMask, Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. GetElementPtrInst *PartPtr = nullptr; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. @@ -2407,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); + BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); }; @@ -2425,8 +2433,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment.value(), MaskPart); } else { @@ -2437,10 +2445,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, - Alignment.value(), Mask[Part]); + NewSI = Builder.CreateMaskedStore( + StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); else NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); @@ -2456,17 +2464,17 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment.value(), Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); + NewLI = Builder.CreateMaskedLoad( + VecPtr, Alignment.value(), BlockInMaskParts[Part], + UndefValue::get(DataTy), "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), "wide.load"); @@ -2676,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2694,48 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + // Create new preheader for vector loop. 
+ LoopVectorPreHeader = + SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, + nullptr, "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, SCEVCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + } + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2744,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. 
We put the // checks into a separate block to make the more common case of few elements @@ -2752,11 +2777,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2770,24 +2795,28 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, MemCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); + } + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2912,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2934,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. 
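(Aside: the skeleton code below converges on a single idiom for carving runtime-check blocks out of the loop preheader. A sketch of that idiom with illustrative names, not from the patch; SplitBlock itself keeps the dominator tree and LoopInfo consistent:)

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Turn the current preheader into a runtime-check block and split a fresh
// vector preheader off its end, mirroring the emit*Checks refactor above.
static BasicBlock *splitOffVectorPreheader(BasicBlock *Preheader,
                                           DominatorTree *DT, LoopInfo *LI) {
  BasicBlock *NewPreheader = SplitBlock(
      Preheader, Preheader->getTerminator(), DT, LI, nullptr, "vector.ph");
  Preheader->setName("vector.memcheck"); // the old block now holds the checks
  return NewPreheader;
}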
- BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopVectorPreHeader = OrigLoop->getLoopPreheader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopExitBlock && "Must have an exit block"); + assert(LoopVectorPreHeader && "Invalid loop structure"); + + LoopMiddleBlock = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "middle.block"); + LoopScalarPreHeader = + SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, + nullptr, "scalar.ph"); + // We intentionally don't let SplitBlock update LoopInfo, since + // LoopVectorBody should belong to a different loop than LoopVectorPreHeader. + // LoopVectorBody is explicitly added to the correct place a few lines later. + LoopVectorBody = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + nullptr, nullptr, "vector.body"); + + // Update dominator for loop exit. + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2949,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2966,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3003,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. 
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3015,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3049,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3060,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. 
- LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, @@ -3094,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + LI->verify(*DT); +#endif + return LoopVectorPreHeader; } @@ -3429,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -3550,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - // Set the insertion point after the previous value if it is an instruction. + // Find and set the insertion point after the previous value if it is an + // instruction. + BasicBlock::iterator InsertPt; // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. Also, if the previous - // value is a phi node, we should insert after all the phi nodes to avoid - // breaking basic block verification. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || - isa<PHINode>(PreviousLastPart)) - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - else - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); + // guaranteed to be an instruction in the vector loop. + // FIXME: Loop invariant values do not form recurrences. We should deal with + // them earlier. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) + InsertPt = LoopVectorBody->getFirstInsertionPt(); + else { + Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); + if (isa<PHINode>(PreviousLastPart)) + // If the previous value is a phi node, we should insert after all the phi + // nodes in the block containing the PHI to avoid breaking basic block + // verification. Note that the basic block may be different to + // LoopVectorBody, in case we predicate the loop. + InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); + else + InsertPt = ++PreviousInst->getIterator(); + } + Builder.SetInsertPoint(&*InsertPt); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. 
@@ -3693,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } } + // Wrap flags are in general invalid after vectorization, clear them. + clearReductionWrapFlags(RdxDesc); + // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); @@ -3839,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } +void InnerLoopVectorizer::clearReductionWrapFlags( + RecurrenceDescriptor &RdxDesc) { + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurrenceDescriptor::RK_IntegerAdd && + RK != RecurrenceDescriptor::RK_IntegerMult) + return; + + Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); + assert(LoopExitInstr && "null loop exit instruction"); + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(LoopExitInstr); + Visited.insert(LoopExitInstr); + + while (!Worklist.empty()) { + Instruction *Cur = Worklist.pop_back_val(); + if (isa<OverflowingBinaryOperator>(Cur)) + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = getOrCreateVectorValue(Cur, Part); + cast<Instruction>(V)->dropPoisonGeneratingFlags(); + } + + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && + Visited.insert(UI).second) + Worklist.push_back(UI); + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { @@ -3960,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. 
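// Illustrative note (not from the patch): when every operand is
// loop-invariant and VF > 1, the cloned GEP below is a single scalar
// pointer; CreateVectorSplat then broadcasts it to a VF-wide vector of
// pointers, so downstream users of the widened GEP uniformly see
// vector-typed addresses.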
+ auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast<PHINode>(PN); @@ -4062,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast<GetElementPtrInst>(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. 
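The all-invariant special case exists because a GEP built only from scalar operands would itself stay scalar; cloning the GEP once and splatting the clone per unroll part is the cheapest way to force a vector of pointers. A rough standalone C++ picture of the resulting shape, where a toy "vector value" is simply VF copies of a scalar and nothing is LLVM API:

#include <vector>

using Scalar = long;                  // stands in for the cloned scalar GEP
using VecValue = std::vector<Scalar>; // stands in for a vector of pointers

// One splat per unroll part, mirroring the CreateVectorSplat loop above.
std::vector<VecValue> splatPerPart(Scalar Clone, unsigned UF, unsigned VF) {
  std::vector<VecValue> Parts(UF);
  for (unsigned Part = 0; Part < UF; ++Part)
    Parts[Part] = VecValue(VF, Clone);
  return Parts;
}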
- auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -4335,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4562,9 +4619,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne return WideningDecision == CM_Scalarize; } const MaybeAlign Alignment = getLoadStoreAlignment(I); - return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr, Alignment) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || isLegalMaskedScatter(Ty)); + return isa<LoadInst>(I) ? 
!(isLegalMaskedLoad(Ty, Ptr, Alignment) || + isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4667,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. + auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4733,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4762,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4807,11 +4873,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); @@ -5143,9 +5206,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small trip count. 
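The new addToWorklistIfAllowed lambda gives collectLoopUniforms a single choke point, so an instruction that is scalar with predication can no longer reach the uniforms worklist from any call site. The same pattern reduced to standalone C++, with the predicate as a stand-in for isScalarWithPredication:

#include <set>

struct Inst;

// Route every insertion through one guard instead of re-checking at each
// call site; disallowed candidates are rejected in exactly one place.
template <typename Pred>
void addIfAllowed(std::set<const Inst *> &Worklist, const Inst *I,
                  Pred IsScalarWithPredication) {
  if (IsScalarWithPredication(I))
    return; // would form a replicating region; must not be marked uniform
  Worklist.insert(I);
}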
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountInterleaveThreshold) + // Do not interleave loops with a relatively small known or estimated trip + // count. + auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) return 1; RegisterUsage R = calculateRegisterUsage({VF})[0]; @@ -5208,12 +5272,10 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - // If the trip count is constant, limit the interleave count to be less than - // the trip count divided by VF. - if (TC > 0) { - assert(TC >= VF && "VF exceeds trip count?"); - if ((TC / VF) < MaxInterleaveCount) - MaxInterleaveCount = (TC / VF); + // If trip count is known or estimated compile time constant, limit the + // interleave count to be less than the trip count divided by VF. + if (BestKnownTC) { + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); } // If we did not calculate the cost for VF (because the user selected the VF) @@ -5746,7 +5808,7 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // vectorized loop where the user of it is a vectorized instruction. const MaybeAlign Alignment = getLoadStoreAlignment(I); Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), - Alignment ? Alignment->value() : 0, AS); + Alignment, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5783,8 +5845,7 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment ? Alignment->value() : 0, AS); else - Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, - Alignment ? Alignment->value() : 0, AS, I); + Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); bool Reverse = ConsecutiveStride < 0; if (Reverse) @@ -5800,16 +5861,14 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Load, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); } StoreInst *SI = cast<StoreInst>(I); bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(Instruction::Store, ValTy, - Alignment ? Alignment->value() : 0, AS) + + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, @@ -5877,8 +5936,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + - TTI.getMemoryOpCost(I->getOpcode(), ValTy, - Alignment ? Alignment->value() : 0, AS, I); + TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I); } return getWideningCost(I, VF); } @@ -6217,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; @@ -6225,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0)); + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6714,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { - const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. - auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. - assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); -} - VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { @@ -6754,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) return false; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); + if (Decision == LoopVectorizationCostModel::CM_Interleave) + return true; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -6773,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - return new VPWidenMemoryInstructionRecipe(*I, Mask); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); } VPWidenIntOrFpInductionRecipe * @@ -6861,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: 
@@ -6926,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. - if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6951,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. Avoid hoisting the insert-element which packs the scalar value into @@ -7009,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. 
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -7094,19 +7129,57 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, SmallPtrSetImpl<Instruction *> &DeadInstructions) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. 
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -7125,10 +7198,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -7138,43 +7208,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -7199,6 +7232,33 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. 
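Both transformations applied here, sink-after and interleave-group substitution, depend on finding the recipe that was created for a given instruction well after recipe construction ran. A compact standalone C++ sketch of that record-then-look-up contract; names are hypothetical and the real VPRecipeBuilder differs in detail:

#include <cassert>
#include <unordered_map>
#include <unordered_set>

struct Inst;
struct Recipe;

class RecipeRecorder {
  std::unordered_set<const Inst *> ToRecord;
  std::unordered_map<const Inst *, Recipe *> Ingredient2Recipe;

public:
  // Pre-construction: mark ingredients whose recipes must stay findable.
  void recordRecipeOf(const Inst *I) { ToRecord.insert(I); }

  // Construction: remember the recipe for every marked ingredient.
  void setRecipe(const Inst *I, Recipe *R) {
    if (ToRecord.count(I))
      Ingredient2Recipe[I] = R;
  }

  // Transformation: retrieve the recipe to move, replace, or erase it.
  Recipe *getRecipe(const Inst *I) const {
    auto It = Ingredient2Recipe.find(I);
    assert(It != Ingredient2Recipe.end() && "recipe was never recorded");
    return It->second;
  }
};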
+ for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + // Finally, if tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the end of the latch. if (CM.foldTailByMasking()) { @@ -7255,9 +7315,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7266,13 +7325,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7286,6 +7353,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7336,15 +7408,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7431,29 +7496,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); - - // Last (and currently only) operand is a mask. 
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); -} - -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || Hints.getPredicate()) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; - - return SEL; + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} + +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7470,14 +7552,16 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. 
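The rewritten getScalarEpilogueLowering above is a strict priority ladder: optsize wins outright, then the explicit opt-out (or a missing primary induction) forces a scalar epilogue, and only then do the opt-in paths for predication apply. Reduced to standalone C++ over boolean inputs so the precedence is explicit; names are hypothetical but the ordering matches the numbered comments:

enum class EpilogueLowering { Allowed, NotAllowedOptSize, UsePredicate };

// Ordered early returns encode the same precedence as rules 1) to 4).
EpilogueLowering chooseLowering(bool OptSize, bool ForceEnabled,
                                bool PredicateOptDisabled,
                                bool HasPrimaryInduction,
                                bool PredicateRequested) {
  if (OptSize && !ForceEnabled)
    return EpilogueLowering::NotAllowedOptSize; // 1) optsize first
  if (PredicateOptDisabled || !HasPrimaryInduction)
    return EpilogueLowering::Allowed;           // 2) explicit opt-out
  if (PredicateRequested)
    return EpilogueLowering::UsePredicate;      // 3) and 4) opt-in paths
  return EpilogueLowering::Allowed;
}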
Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); @@ -7562,7 +7646,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7635,7 +7720,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 974eff9974d9..aabd974cd73e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. 
This prevents compilation time increase.")); + static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -547,7 +567,7 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (anf updating it, if required) list of externally used + /// into account (and updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, @@ -609,7 +629,10 @@ public: return MinVecRegSize; } - /// Check if ArrayType or StructType is isomorphic to some VectorType. + /// Check if a homogeneous aggregate is isomorphic to some VectorType. + /// Accepts homogeneous multidimensional aggregates of scalars/vectors like + /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, + /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -721,6 +744,7 @@ public: const DataLayout &DL; ScalarEvolution &SE; + const BoUpSLP &R; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -746,6 +770,227 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } + // The hard-coded scores listed here are not very important. When computing + // the scores of matching one sub-tree with another, we are basically + // counting the number of values that are matching. So even if all scores + // are set to 1, we would still get a decent matching result. + // However, sometimes we have to break ties. For example we may have to + // choose between matching loads vs matching opcodes. This is what these + // scores are helping us with: they provide the order of preference. + + /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). + static const int ScoreConsecutiveLoads = 3; + /// ExtractElementInst from same vector and consecutive indexes. + static const int ScoreConsecutiveExtracts = 3; + /// Constants. + static const int ScoreConstants = 2; + /// Instructions with the same opcode. + static const int ScoreSameOpcode = 2; + /// Instructions with alt opcodes (e.g., add + sub). + static const int ScoreAltOpcodes = 1; + /// Identical instructions (a.k.a. splat or broadcast). + static const int ScoreSplat = 1; + /// Matching with an undef is preferable to failing. + static const int ScoreUndef = 1; + /// Score for failing to find a decent match. + static const int ScoreFail = 0; + /// User external to the vectorized code. + static const int ExternalUseCost = 1; + /// The user is internal but in a different lane. + static const int UserInDiffLaneCost = ExternalUseCost; + + /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. + static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, + ScalarEvolution &SE) { + auto *LI1 = dyn_cast<LoadInst>(V1); + auto *LI2 = dyn_cast<LoadInst>(V2); + if (LI1 && LI2) + return isConsecutiveAccess(LI1, LI2, DL, SE) + ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + + auto *C1 = dyn_cast<Constant>(V1); + auto *C2 = dyn_cast<Constant>(V2); + if (C1 && C2) + return VLOperands::ScoreConstants; + + // Extracts from consecutive indexes of the same vector score better, as + // the extracts could be optimized away.
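As the comment block above notes, the absolute score values matter less than their relative order; they exist to break ties. A toy standalone scorer that keeps only the preference ladder, in plain C++ where the classification flags are assumed inputs rather than computed as getShallowScore does:

// Ladder mirroring the constants above: consecutive loads/extracts beat
// constants and same-opcode pairs, which beat alt-opcodes, splats and
// undefs; zero means the pair failed to match.
struct PairKind {
  bool ConsecutiveLoad = false, ConsecutiveExtract = false;
  bool BothConstant = false, SameOpcode = false;
  bool AltOpcode = false, Splat = false, UndefMate = false;
};

int shallowScore(const PairKind &K) {
  if (K.ConsecutiveLoad || K.ConsecutiveExtract)
    return 3;
  if (K.BothConstant || K.SameOpcode)
    return 2;
  if (K.AltOpcode || K.Splat || K.UndefMate)
    return 1;
  return 0; // ScoreFail
}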
+ auto *Ex1 = dyn_cast<ExtractElementInst>(V1); + auto *Ex2 = dyn_cast<ExtractElementInst>(V2); + if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && + cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 == + cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) { + return VLOperands::ScoreConsecutiveExtracts; + } + + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (I1 && I2) { + if (I1 == I2) + return VLOperands::ScoreSplat; + InstructionsState S = getSameOpcode({I1, I2}); + // Note: Only consider instructions with <= 2 operands to avoid + // complexity explosion. + if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) + return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes + : VLOperands::ScoreSameOpcode; + } + + if (isa<UndefValue>(V2)) + return VLOperands::ScoreUndef; + + return VLOperands::ScoreFail; + } + + /// Holds the values and their lane that are taking part in the look-ahead + /// score calculation. This is used in the external uses cost calculation. + SmallDenseMap<Value *, int> InLookAheadValues; + + /// \Returns the additional cost due to uses of \p LHS and \p RHS that are + /// either external to the vectorized code, or require shuffling. + int getExternalUsesCost(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + int Cost = 0; + SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { + Value *V = Values[Idx].first; + // Calculate the absolute lane, using the minimum relative lane of LHS + // and RHS as base and Idx as the offset. + int Ln = std::min(LHS.second, RHS.second) + Idx; + assert(Ln >= 0 && "Bad lane calculation"); + unsigned UsersBudget = LookAheadUsersBudget; + for (User *U : V->users()) { + if (const TreeEntry *UserTE = R.getTreeEntry(U)) { + // The user is in the VectorizableTree. Check if we need to insert. + auto It = llvm::find(UserTE->Scalars, U); + assert(It != UserTE->Scalars.end() && "U is in UserTE"); + int UserLn = std::distance(UserTE->Scalars.begin(), It); + assert(UserLn >= 0 && "Bad lane"); + if (UserLn != Ln) + Cost += UserInDiffLaneCost; + } else { + // Check if the user is in the look-ahead code. + auto It2 = InLookAheadValues.find(U); + if (It2 != InLookAheadValues.end()) { + // The user is in the look-ahead code. Check the lane. + if (It2->second != Ln) + Cost += UserInDiffLaneCost; + } else { + // The user is neither in SLP tree nor in the look-ahead code. + Cost += ExternalUseCost; + } + } + // Limit the number of visited uses to cap compilation time. + if (--UsersBudget == 0) + break; + } + } + return Cost; + } + + /// Go through the operands of \p LHS and \p RHS recursively until \p + /// MaxLevel, and return the cumulative score. For example: + /// \verbatim + /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] + /// \ / \ / \ / \ / + /// + + + + + /// G1 G2 G3 G4 + /// \endverbatim + /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at + /// each level recursively, accumulating the score. It starts from matching + /// the additions at level 0, then moves on to the loads (level 1). The + /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and + /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// Please note that the order of the operands does not matter, as we + /// evaluate the score of all profitable combinations of operands.
In + /// other words the score of G1 and G4 is the same as G1 and G2. This + /// heuristic is based on ideas described in: + /// Look-ahead SLP: Auto-vectorization in the presence of commutative + /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, + /// Luís F. W. Góes + int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS, int CurrLevel, + int MaxLevel) { + + Value *V1 = LHS.first; + Value *V2 = RHS.first; + // Get the shallow score of V1 and V2. + int ShallowScoreAtThisLevel = + std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - + getExternalUsesCost(LHS, RHS)); + int Lane1 = LHS.second; + int Lane2 = RHS.second; + + // If reached MaxLevel, + // or if V1 and V2 are not instructions, + // or if they are SPLAT, + // or if they are not consecutive, early return the current cost. + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || + ShallowScoreAtThisLevel == VLOperands::ScoreFail || + (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + return ShallowScoreAtThisLevel; + assert(I1 && I2 && "Should have early exited."); + + // Keep track of in-tree values for determining the external-use cost. + InLookAheadValues[V1] = Lane1; + InLookAheadValues[V2] = Lane2; + + // Contains the I2 operand indexes that got matched with I1 operands. + SmallSet<unsigned, 4> Op2Used; + + // Recursion towards the operands of I1 and I2. We are trying all possible + // operand pairs, and keeping track of the best score. + for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); + OpIdx1 != NumOperands1; ++OpIdx1) { + // Try to pair operand OpIdx1 of I1 with the best operand of I2. + int MaxTmpScore = 0; + unsigned MaxOpIdx2 = 0; + bool FoundBest = false; + // If I2 is commutative try all combinations. + unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; + unsigned ToIdx = isCommutative(I2) + ? I2->getNumOperands() + : std::min(I2->getNumOperands(), OpIdx1 + 1); + assert(FromIdx <= ToIdx && "Bad index"); + for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { + // Skip operands already paired with OpIdx1. + if (Op2Used.count(OpIdx2)) + continue; + // Recursively calculate the cost at each level. + int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, + {I2->getOperand(OpIdx2), Lane2}, + CurrLevel + 1, MaxLevel); + // Look for the best score. + if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + MaxTmpScore = TmpScore; + MaxOpIdx2 = OpIdx2; + FoundBest = true; + } + } + if (FoundBest) { + // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. + Op2Used.insert(MaxOpIdx2); + ShallowScoreAtThisLevel += MaxTmpScore; + } + } + return ShallowScoreAtThisLevel; + } + + /// \Returns the look-ahead score, which tells us how much the sub-trees + /// rooted at \p LHS and \p RHS match; the more they match, the higher the + /// score. This helps break ties in an informed way when we cannot decide on + /// the order of the operands by just considering the immediate + /// predecessors. + int getLookAheadScore(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + InLookAheadValues.clear(); + return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + } + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its operand index. // If no good match can be found, return None. @@ -763,9 +1008,6 @@ public: // The linearized opcode of the operand at OpIdx, Lane.
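The pairing loop in getScoreAtLevelRec matches each operand of I1 greedily with the best-scoring unused operand of I2 and accumulates the winners. The same greedy matching over a precomputed score matrix in standalone C++; the shape is hypothetical, and the real code scores each pair recursively instead of reading a matrix:

#include <vector>

// Greedily match each row (an operand of I1) with its best unused column
// (an operand of I2), summing the per-pair scores. Rows are assumed to
// share one width.
int bestPairingScore(const std::vector<std::vector<int>> &Score) {
  std::vector<bool> ColUsed(Score.empty() ? 0 : Score[0].size(), false);
  int Total = 0;
  for (const auto &Row : Score) {
    int Best = 0;
    int BestCol = -1;
    for (int C = 0, E = (int)Row.size(); C != E; ++C)
      if (!ColUsed[C] && Row[C] > Best) {
        Best = Row[C];
        BestCol = C;
      }
    if (BestCol >= 0) {
      ColUsed[BestCol] = true; // never revisit a matched operand
      Total += Best;
    }
  }
  return Total;
}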
bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -794,41 +1036,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -959,8 +1179,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1189,7 +1409,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1211,7 +1432,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? 
SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1353,15 +1575,30 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; - dbgs() << "MainOp: " << *MainOp << "\n"; - dbgs() << "AltOp: " << *AltOp << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; @@ -1392,7 +1629,7 @@ private: TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; @@ -1721,7 +1958,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -2063,7 +2300,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -2115,7 +2352,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2152,7 +2389,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2448,7 +2685,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; @@ -2543,7 +2780,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2590,7 +2827,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // have the same opcode. 
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2637,9 +2874,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } // We don't combine GEPs with non-constant indexes. + Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { auto Op = cast<Instruction>(V)->getOperand(1); - if (!isa<ConstantInt>(Op)) { + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2665,24 +2907,74 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::Store: { // Check if the stores are consecutive or if we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. + SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. 
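The consecutiveness test in the store handling above reduces to arithmetic on sorted pointer offsets: the distance between the first and last access must equal (n - 1) times the element size. Standalone C++ over plain integer offsets; sorting and SCEV simplification are assumed already done, and offsets are assumed distinct, as in the hunk above:

#include <cstdint>
#include <vector>

// Sorted, distinct byte offsets are consecutive iff the span equals
// (n - 1) * Size; e.g. {0, 4, 8, 12} with Size = 4: 12 - 0 == 3 * 4.
bool areConsecutive(const std::vector<std::uint64_t> &SortedOffsets,
                    std::uint64_t Size) {
  if (SortedOffsets.size() < 2)
    return true;
  return SortedOffsets.back() - SortedOffsets.front() ==
         (SortedOffsets.size() - 1) * Size;
}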
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - ValueList Operands; - for (Value *V : VL) - Operands.push_back(cast<Instruction>(V)->getOperand(0)); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2777,7 +3069,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2806,27 +3098,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. - for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2927,7 +3221,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { @@ -2995,7 +3289,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
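canMapToVector now flattens homogeneous nested aggregates by multiplying element counts while walking inward, rather than handling a single struct or array level. The same walk over a toy type descriptor in standalone C++; the structures are hypothetical, not LLVM's Type hierarchy:

// Toy type descriptor: Element is null for scalars; Homogeneous is false
// for structs whose members differ.
struct Ty {
  unsigned Count = 1;
  const Ty *Element = nullptr;
  bool Homogeneous = true;
};

// Flatten e.g. {[4 x i16], [4 x i16]} to 8 elements; bail out with 0 at
// any non-homogeneous level, as the rewritten loop above does.
unsigned flattenedElementCount(const Ty *T) {
  unsigned N = 1;
  while (T->Element) {
    if (!T->Homogeneous)
      return 0;
    N *= T->Count;
    T = T->Element;
  }
  return N; // the caller still validates the scalar type and total size
}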
@@ -3135,13 +3429,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3162,7 +3456,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -3180,15 +3474,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3274,20 +3575,22 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; @@ -3397,7 +3700,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. 
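The store case in this hunk now adds an SK_PermuteSingleSrc shuffle to the vector cost whenever the bundle had to be reordered. The sign convention is worth spelling out: an entry pays off when ReuseShuffleCost + VecCost - ScalarCost is negative. A back-of-the-envelope computation with invented unit costs (illustrative numbers, not real TTI output):

#include <cstdio>

int main() {
  // Hypothetical 4 x i32 store bundle that needs a reordering shuffle.
  int NumElements = 4;
  int ScalarEltCost = 1;                        // one scalar store
  int ScalarStCost = NumElements * ScalarEltCost;
  int VecStCost = 1;                            // one wide store
  VecStCost += 1;                               // reorder permute
  int ReuseShuffleCost = 0;                     // no reused scalars here
  int EntryCost = ReuseShuffleCost + VecStCost - ScalarStCost;
  std::printf("entry cost = %d\n", EntryCost);  // 2 - 4 = -2: profitable
  return 0;
}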
if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3441,12 +3744,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3538,13 +3842,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3735,7 +4041,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { @@ -3790,7 +4096,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3823,7 +4129,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -4050,15 +4356,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -4088,7 +4404,22 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -4284,7 +4615,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4357,7 +4688,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -5332,125 +5663,140 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - ArrayRef<Value *> Operands = Chain.slice(i, VF); - // Check that a previous iteration of this loop did not delete the Value. 
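When a jumbled store bundle is vectorized, the code above shuffles the value vector before issuing the wide store. The helper it calls, inversePermutation, builds the inverse of a lane order with Mask[Indices[I]] = I. A self-contained sketch of that mapping with a sanity check (the std::vector signature is simplified from the SmallVector-based LLVM utility):

#include <cassert>
#include <vector>

// Inverse of a permutation: if Indices[I] names the source lane for
// position I, then Mask names the destination position of each source
// lane, so composing the two yields the identity.
std::vector<unsigned> inversePermutation(const std::vector<unsigned> &Indices) {
  std::vector<unsigned> Mask(Indices.size());
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
  return Mask;
}

int main() {
  std::vector<unsigned> Order = {2, 0, 3, 1};
  std::vector<unsigned> Mask = inversePermutation(Order);
  for (unsigned I = 0; I < Order.size(); ++I)
    assert(Mask[Order[I]] == I);
  return 0;
}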
- if (llvm::any_of(Operands, [&R](Value *V) { - auto *I = dyn_cast<Instruction>(V); - return I && R.isDeleted(I); - })) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. 
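This hunk and the one that follows replace the pointer-keyed Heads/Tails sets with plain index arrays: ConsecutiveChain[K] holds the index of the store one element past Stores[K], with E + 1 as the "no successor" sentinel, and each collected chain is then carved into power-of-two slices from widest to narrowest. A compact model of both loops, assuming integer offsets in place of the SCEV-based isConsecutiveAccess and treating every untouched slice as profitable:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Offset = {3, 1, 2, 0, 5, 4}; // store addresses
  int E = Offset.size();
  std::vector<int> ConsecutiveChain(E, E + 1);  // E + 1 == no successor
  // Quadratic search: link K -> Idx when Stores[Idx] is one element past
  // Stores[K] (the real code also caps the number of probes).
  for (int Idx = 0; Idx < E; ++Idx)
    for (int K = 0; K < E; ++K)
      if (Offset[Idx] == Offset[K] + 1)
        ConsecutiveChain[K] = Idx;
  // Follow one chain from its head (the store with offset 0).
  int Head = 3;
  std::vector<int> Chain;
  for (int I = Head; I != E + 1; I = ConsecutiveChain[I])
    Chain.push_back(I);
  // Slice the chain: widest power-of-two window first, then halve.
  std::vector<bool> Vectorized(E, false);
  unsigned MaxElts = 4; // assumed register width / element size
  unsigned StartIdx = 0;
  for (unsigned Size = MaxElts; Size >= 2; Size /= 2) {
    for (unsigned Cnt = StartIdx; Cnt + Size <= Chain.size();) {
      bool Free = true;
      for (unsigned J = 0; J < Size; ++J)
        Free = Free && !Vectorized[Chain[Cnt + J]];
      if (Free) { // stand-in for a profitable vectorizeStoreChain call
        for (unsigned J = 0; J < Size; ++J)
          Vectorized[Chain[Cnt + J]] = true;
        if (Cnt == StartIdx)
          StartIdx += Size;
        Cnt += Size;
        continue;
      }
      ++Cnt;
    }
    if (StartIdx >= Chain.size())
      break; // the whole chain has been consumed
  }
  for (int I : Chain)
    std::printf("store %d vectorized: %d\n", I, (int)Vectorized[I]);
  return 0;
}

Trying the widest window first is what lifts the old fixed VF limit: a leftover tail that the wide pass skips is retried at the next smaller power of two instead of being abandoned.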
- int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5835,38 +6181,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. + bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. 
+ unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5875,79 +6219,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5980,12 +6291,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. 
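Most of this hunk's deletions are the same four-label switch repeated in every OperationData helper; the refactor funnels them through a single isMinMax() predicate. The shared rule: a min/max reduction is a compare feeding a select (three operands, condition first), while an arithmetic reduction is a plain two-operand instruction. The pattern in miniature, as a sketch rather than the real class:

#include <cassert>

enum ReductionKind { RK_None, RK_Arithmetic, RK_Min, RK_Max, RK_UMin, RK_UMax };

struct OperationData {
  ReductionKind Kind = RK_None;

  // One predicate replaces the repeated switches: every min/max flavor
  // shares the select-of-compare shape.
  bool isMinMax() const {
    return Kind == RK_Min || Kind == RK_Max || Kind == RK_UMin ||
           Kind == RK_UMax;
  }
  // select(cmp, a, b): three operands with the condition in slot 0.
  unsigned getNumberOfOperands() const { return isMinMax() ? 3 : 2; }
  unsigned getFirstOperandIndex() const {
    if (Kind == RK_None) // allowed before the kind is known
      return 0;
    return isMinMax() ? 1 : 0;
  }
};

int main() {
  OperationData Add{RK_Arithmetic}, SMax{RK_Max};
  assert(Add.getNumberOfOperands() == 2 && Add.getFirstOperandIndex() == 0);
  assert(SMax.getNumberOfOperands() == 3 && SMax.getFirstOperandIndex() == 1);
  return 0;
}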
- bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -6005,18 +6316,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6400,6 +6700,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6455,8 +6767,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. + Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6492,8 +6810,20 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + // Mark all scalar reduction ops for deletion, they are replaced by the // vector reductions. 
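A minimal sketch of the insertion-point rule added above, written against LLVM's C++ API (it assumes the LLVM headers and is a fragment, not a complete pass): for a min/max reduction the root is a select, but new reduction IR must be created at its compare, which dominates the select and can later be RAUW'ed by the vectorized condition exactly as the hunk does.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

// Mirrors the getCmpForMinMaxReduction lambda: the compare feeding the
// select is where new instructions belong.
static Instruction *getCmpForMinMaxReduction(Instruction *RdxRootInst) {
  assert(isa<SelectInst>(RdxRootInst) &&
         "Expected min/max reduction to have select root instruction");
  Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
  assert(isa<Instruction>(ScalarCond) &&
         "Expected min/max reduction to have compare condition");
  return cast<Instruction>(ScalarCond);
}

// Position the builder before emitting the vectorized reduction.
static void setReductionInsertPoint(IRBuilder<> &Builder,
                                    Instruction *ReductionRoot,
                                    bool IsMinMax) {
  if (IsMinMax)
    Builder.SetInsertPoint(getCmpForMinMaxReduction(ReductionRoot));
  else
    Builder.SetInsertPoint(ReductionRoot);
}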
V.eraseInstructions(IgnoreList); @@ -6619,45 +6949,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. -static bool findBuildAggregate(InsertValueInst *IV, - SmallVectorImpl<Value *> &BuildVectorOpds) { +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, + SmallVectorImpl<Value *> &BuildVectorOpds, + int &UserCost) { + assert((isa<InsertElementInst>(LastInsertInst) || + isa<InsertValueInst>(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); + UserCost = 0; do { - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); - Value *V = IV->getAggregateOperand(); - if (isa<UndefValue>(V)) + Value *InsertedOperand; + if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) { + InsertedOperand = IE->getOperand(1); + LastInsertInst = IE->getOperand(0); + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + } else { + auto *IV = cast<InsertValueInst>(LastInsertInst); + InsertedOperand = IV->getInsertedValueOperand(); + LastInsertInst = IV->getAggregateOperand(); + } + if (isa<InsertElementInst>(InsertedOperand) || + isa<InsertValueInst>(InsertedOperand)) { + int TmpUserCost; + SmallVector<Value *, 8> TmpBuildVectorOpds; + if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, + TmpUserCost)) + return false; + BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), + TmpBuildVectorOpds.rend()); + UserCost += TmpUserCost; + } else { + BuildVectorOpds.push_back(InsertedOperand); + } + if (isa<UndefValue>(LastInsertInst)) break; - IV = dyn_cast<InsertValueInst>(V); - if (!IV || !IV->hasOneUse()) + if ((!isa<InsertValueInst>(LastInsertInst) && + !isa<InsertElementInst>(LastInsertInst)) || + !LastInsertInst->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6825,25 +7164,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int 
UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -7118,14 +7458,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. - for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 0ca6a6b93cfd..598fb00e956e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -47,6 +47,24 @@ class VPRecipeBuilder { EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to + // their recipe. To save on memory, only do so for selected ingredients, + // marked by having a nullptr entry in this map. If those ingredients get a + // VPWidenRecipe, also avoid compressing other ingredients into it to avoid + // having to split such recipes later. + DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; + VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Set the recipe created for given ingredient. This operation is a no-op for + /// ingredients that were not marked using a nullptr entry in the map. + void setRecipe(Instruction *I, VPRecipeBase *R) { + if (!Ingredient2Recipe.count(I)) + return; + assert(Ingredient2Recipe[I] == nullptr && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = R; + } + public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -57,16 +75,22 @@ public: /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. 
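The Ingredient2Recipe map introduced in this VPRecipeBuilder hunk implements a small three-step protocol: recordRecipeOf marks an instruction with a null entry, setRecipe fills only marked entries (a deliberate no-op otherwise), and getRecipe asserts both that tracking was requested and that a recipe was actually produced. A stand-alone model, using std::unordered_map in place of DenseMap and stand-in Inst/Recipe types:

#include <cassert>
#include <unordered_map>

struct Inst {};
struct Recipe {};

class RecipeTracker {
  std::unordered_map<Inst *, Recipe *> Ingredient2Recipe;

public:
  // Mark an ingredient whose recipe we will need later.
  void recordRecipeOf(Inst *I) {
    assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) &&
           "Recipe already set for ingredient");
    Ingredient2Recipe[I] = nullptr;
  }
  // No-op for untracked ingredients: only marked entries are filled.
  void setRecipe(Inst *I, Recipe *R) {
    if (!Ingredient2Recipe.count(I))
      return;
    assert(Ingredient2Recipe[I] == nullptr && "Recipe already set");
    Ingredient2Recipe[I] = R;
  }
  Recipe *getRecipe(Inst *I) {
    assert(Ingredient2Recipe.count(I) && "Recording was not requested");
    assert(Ingredient2Recipe[I] && "Ingredient doesn't have a recipe");
    return Ingredient2Recipe[I];
  }
};

int main() {
  RecipeTracker T;
  Inst I;
  Recipe R;
  T.recordRecipeOf(&I);
  T.setRecipe(&I, &R);
  assert(T.getRecipe(&I) == &R);
  return 0;
}

Tracking only selected ingredients keeps the map small, which is the memory trade-off the header comment calls out.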
- /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for + /// it. + void recordRecipeOf(Instruction *I) { + assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = nullptr; + } + + /// Return the recipe created for given ingredient. + VPRecipeBase *getRecipe(Instruction *I) { + assert(Ingredient2Recipe.count(I) && + "Recording this ingredients recipe was not requested"); + assert(Ingredient2Recipe[I] != nullptr && + "Ingredient doesn't have a recipe"); + return Ingredient2Recipe[I]; + } /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. Such instructions are handled by a recipe that takes diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4b80d1fb20aa..f1c708720ccf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -275,18 +276,35 @@ void VPRegionBlock::execute(VPTransformState *State) { } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { - InsertPos->getParent()->getRecipeList().splice( - std::next(InsertPos->getIterator()), getParent()->getRecipeList(), - getIterator()); + removeFromParent(); + insertAfter(InsertPos); } void VPInstruction::generateInstruction(VPTransformState &State, @@ -447,14 +465,20 @@ void VPlan::execute(VPTransformState *State) { // We do not attempt to preserve DT for outer loop vectorization currently. 
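The VPRecipeBase changes above decompose the old ad-hoc splice in moveAfter into two reusable primitives, removeFromParent() and insertAfter(), each of which can now assert its own precondition about Parent. A stand-in recipe/block pair using std::list (the real code uses LLVM's intrusive iplist, so this is only a shape-for-shape sketch):

#include <cassert>
#include <iterator>
#include <list>

struct Recipe {
  std::list<Recipe *> *Parent = nullptr; // owning "basic block"
  std::list<Recipe *>::iterator Self;    // position within it

  void removeFromParent() {
    assert(Parent && "Recipe not in any block");
    Parent->erase(Self);
    Parent = nullptr;
  }
  void insertAfter(Recipe *Pos) {
    assert(!Parent && "Recipe already in some block");
    assert(Pos->Parent && "Insertion position not in any block");
    Parent = Pos->Parent;
    Self = Parent->insert(std::next(Pos->Self), this);
  }
  // moveAfter is now just the composition of the two primitives.
  void moveAfter(Recipe *Pos) {
    removeFromParent();
    insertAfter(Pos);
  }
};

int main() {
  std::list<Recipe *> BB;
  Recipe A, B;
  BB.push_back(&A);
  A.Parent = &BB;
  A.Self = std::prev(BB.end());
  BB.push_back(&B);
  B.Parent = &BB;
  B.Self = std::prev(BB.end());
  A.moveAfter(&B); // order becomes B, A
  assert(BB.front() == &B && BB.back() == &A);
  return 0;
}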
if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. @@ -485,6 +509,9 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + // Latch block is a new dominator for the loop exit. + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -509,8 +536,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -522,7 +548,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; @@ -661,6 +687,16 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? 
"Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } @@ -703,9 +739,12 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 44d8a198f27e..c65abc3639d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -226,6 +227,8 @@ public: struct VPCallback { virtual ~VPCallback() {} virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; + virtual Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -268,6 +271,13 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } + /// Get the generated Value for a given VPValue and given Part and Lane. Note + /// that as per-lane Defs are still created by ILV and managed in its ValueMap + /// this method currently just delegates the call to ILV. + Value *get(VPValue *Def, const VPIteration &Instance) { + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + } + /// Set the generated Value for a given VPValue and a given Part. void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -567,6 +577,7 @@ public: /// instructions. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; + friend class VPBlockUtils; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -586,6 +597,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -615,10 +627,18 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked Recipe into a basic block immediately after + /// the specified Recipe. + void insertAfter(VPRecipeBase *InsertPos); + /// Unlink this recipe from its current VPBasicBlock and insert it into /// the VPBasicBlock that MovePos lives in, right after MovePos. void moveAfter(VPRecipeBase *MovePos); + /// This method unlinks 'this' from the containing basic block, but does not + /// delete it. + void removeFromParent(); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one @@ -630,7 +650,6 @@ public: /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
class VPInstruction : public VPUser, public VPRecipeBase { - friend class VPlanHCFGTransforms; friend class VPlanSlp; public: @@ -740,6 +759,36 @@ public: void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. +class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { @@ -822,13 +871,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup<Instruction> *IG; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + if (Mask) + User.addOperand(Mask); } ~VPInterleaveRecipe() override = default; @@ -837,6 +887,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -959,13 +1021,14 @@ public: class VPWidenMemoryInstructionRecipe : public VPRecipeBase { private: Instruction &Instr; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) { + if (Mask) + User.addOperand(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -973,6 +1036,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; } + /// Return the address accessed by this recipe. 
+ VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load/store. void execute(VPTransformState &State) override; @@ -1143,6 +1218,128 @@ public: void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<const VPRegionBlock *> + : public GraphTraits<const VPBlockBase *> { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<Inverse<VPRegionBlock *>> + : public GraphTraits<Inverse<VPBlockBase *>> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(Inverse<GraphRef> N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1245,35 +1442,45 @@ public: return Value2VPValue[V]; } + VPValue *getOrAddVPValue(Value *V) { + assert(V && "Trying to get or add the VPValue of a null Value"); + if (!Value2VPValue.count(V)) + addVPValue(V); + return getVPValue(V); + } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + BasicBlock *LoopExitBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; - unsigned Depth; + const VPlan &Plan; + unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap<const VPBlockBase *, unsigned> BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. 
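GraphTraits is the adapter all of these specializations implement: once child_begin/child_end are provided, generic algorithms such as depth_first() traverse the graph without knowing its node type, which is exactly how VPlanPrinter walks the plan. A minimal sketch for a toy CFG node, assuming LLVM's ADT headers are available (it compiles against the LLVM tree, not standalone):

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SmallVector.h"

struct ToyBlock {
  llvm::SmallVector<ToyBlock *, 2> Successors;
};

namespace llvm {
template <> struct GraphTraits<ToyBlock *> {
  using NodeRef = ToyBlock *;
  using ChildIteratorType = SmallVectorImpl<ToyBlock *>::iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }
  static ChildIteratorType child_begin(NodeRef N) {
    return N->Successors.begin();
  }
  static ChildIteratorType child_end(NodeRef N) {
    return N->Successors.end();
  }
};
} // namespace llvm

// Count the blocks reachable from Entry; depth_first() visits each
// node exactly once.
unsigned countReachable(ToyBlock *Entry) {
  unsigned N = 0;
  for (ToyBlock *B : llvm::depth_first(Entry)) {
    (void)B;
    ++N;
  }
  return N;
}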
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1320,135 +1527,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } //===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. - -template <> struct GraphTraits<VPBlockBase *> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits<const VPBlockBase *> { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits<Inverse<VPBlockBase *>> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<const VPRegionBlock *> - : public GraphTraits<const VPBlockBase *> { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<Inverse<VPRegionBlock *>> - : public GraphTraits<Inverse<VPBlockBase *>> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -//===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b22d3190d654..3f6a2efd55cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1,4 +1,4 @@ -//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,13 +11,13 @@ /// //===----------------------------------------------------------------------===// -#include "VPlanHCFGTransforms.h" +#include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; -void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, +void VPlanTransforms::VPInstructionsToVPRecipes( + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { @@ -56,7 +56,9 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) - NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { InductionDescriptor II = Inductions->lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || @@ -64,6 +66,8 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. 
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 79a23c33184f..0d3bd7da09a7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -1,4 +1,4 @@
-//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===//
+//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,8 +10,8 @@
 /// This file provides utility VPlan to VPlan transformations.
 //===----------------------------------------------------------------------===//

-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H

 #include "VPlan.h"
 #include "llvm/IR/Instruction.h"
@@ -19,17 +19,17 @@

 namespace llvm {

-class VPlanHCFGTransforms {
+class VPlanTransforms {

 public:
   /// Replaces the VPInstructions in \p Plan with corresponding
   /// widen recipes.
   static void VPInstructionsToVPRecipes(
-      VPlanPtr &Plan,
+      Loop *OrigLoop, VPlanPtr &Plan,
       LoopVectorizationLegality::InductionList *Inductions,
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 };

 } // namespace llvm

-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 7b6c228c229e..464498c29d89 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,7 +37,7 @@ class VPUser;
 // and live-outs which the VPlan will need to fix accordingly.
 class VPValue {
   friend class VPBuilder;
-  friend class VPlanHCFGTransforms;
+  friend class VPlanTransforms;
   friend class VPBasicBlock;
   friend class VPInterleavedAccessInfo;

diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 394b1b93113b..ab3e7e2282e7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -14,6 +14,7 @@

 #include "VPlanVerifier.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"

 #define DEBUG_TYPE "loop-vectorize"