diff options
Diffstat (limited to 'lib/Transforms/IPO')
20 files changed, 2765 insertions, 1404 deletions
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index 65b7bad3b1ed..a2c8a32dfe86 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -29,8 +29,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ArgumentPromotion.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -38,6 +39,7 @@ #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LazyCallGraph.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/CFG.h" @@ -51,323 +53,400 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" #include <set> using namespace llvm; #define DEBUG_TYPE "argpromotion" -STATISTIC(NumArgumentsPromoted , "Number of pointer arguments promoted"); +STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted"); STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted"); -STATISTIC(NumByValArgsPromoted , "Number of byval arguments promoted"); -STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated"); +STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted"); +STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated"); -namespace { - /// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. 
- /// - struct ArgPromotion : public CallGraphSCCPass { - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - getAAResultsAnalysisUsage(AU); - CallGraphSCCPass::getAnalysisUsage(AU); - } +/// A vector used to hold the indices of a single GEP instruction +typedef std::vector<uint64_t> IndicesVector; - bool runOnSCC(CallGraphSCC &SCC) override; - static char ID; // Pass identification, replacement for typeid - explicit ArgPromotion(unsigned maxElements = 3) - : CallGraphSCCPass(ID), maxElements(maxElements) { - initializeArgPromotionPass(*PassRegistry::getPassRegistry()); - } +/// DoPromotion - This method actually performs the promotion of the specified +/// arguments, and returns the new function. At this point, we know that it's +/// safe to do so. +static Function * +doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, + SmallPtrSetImpl<Argument *> &ByValArgsToTransform, + Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>> + ReplaceCallSite) { - private: + // Start by computing a new prototype for the function, which is the same as + // the old function, but has modified arguments. + FunctionType *FTy = F->getFunctionType(); + std::vector<Type *> Params; - using llvm::Pass::doInitialization; - bool doInitialization(CallGraph &CG) override; - /// The maximum number of elements to expand, or 0 for unlimited. - unsigned maxElements; - }; -} + typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; -/// A vector used to hold the indices of a single GEP instruction -typedef std::vector<uint64_t> IndicesVector; + // ScalarizedElements - If we are promoting a pointer that has elements + // accessed out of it, keep track of which elements are accessed so that we + // can add one argument for each. 
+ // + // Arguments that are directly loaded will have a zero element value here, to + // handle cases where there are both a direct load and GEP accesses. + // + std::map<Argument *, ScalarizeTable> ScalarizedElements; -static CallGraphNode * -PromoteArguments(CallGraphNode *CGN, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements); -static bool isDenselyPacked(Type *type, const DataLayout &DL); -static bool canPaddingBeAccessed(Argument *Arg); -static bool isSafeToPromoteArgument(Argument *Arg, bool isByVal, AAResults &AAR, - unsigned MaxElements); -static CallGraphNode * -DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, - SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG); + // OriginalLoads - Keep track of a representative load instruction from the + // original function so that we can tell the alias analysis implementation + // what the new GEP/Load instructions we are inserting look like. + // We need to keep the original loads for each argument and the elements + // of the argument that are accessed. + std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads; -char ArgPromotion::ID = 0; -INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_END(ArgPromotion, "argpromotion", - "Promote 'by reference' arguments to scalars", false, false) + // Attribute - Keep track of the parameter attributes for the arguments + // that we are *not* promoting. 
For the ones that we do promote, the parameter + // attributes are lost + SmallVector<AttributeSet, 8> ArgAttrVec; + AttributeList PAL = F->getAttributes(); -Pass *llvm::createArgumentPromotionPass(unsigned maxElements) { - return new ArgPromotion(maxElements); -} + // First, determine the new argument list + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + if (ByValArgsToTransform.count(&*I)) { + // Simple byval argument? Just add all the struct element types. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + StructType *STy = cast<StructType>(AgTy); + Params.insert(Params.end(), STy->element_begin(), STy->element_end()); + ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(), + AttributeSet()); + ++NumByValArgsPromoted; + } else if (!ArgsToPromote.count(&*I)) { + // Unchanged argument + Params.push_back(I->getType()); + ArgAttrVec.push_back(PAL.getParamAttributes(ArgIndex)); + } else if (I->use_empty()) { + // Dead argument (which are always marked as promotable) + ++NumArgumentsDead; + } else { + // Okay, this is being promoted. This means that the only uses are loads + // or GEPs which are only used by loads -static bool runImpl(CallGraphSCC &SCC, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements) { - bool Changed = false, LocalChange; + // In this table, we will track which indices are loaded from the argument + // (where direct loads are tracked as no indices). 
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + for (User *U : I->users()) { + Instruction *UI = cast<Instruction>(U); + Type *SrcTy; + if (LoadInst *L = dyn_cast<LoadInst>(UI)) + SrcTy = L->getType(); + else + SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType(); + IndicesVector Indices; + Indices.reserve(UI->getNumOperands() - 1); + // Since loads will only have a single operand, and GEPs only a single + // non-index operand, this will record direct loads without any indices, + // and gep+loads with the GEP indices. + for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); + II != IE; ++II) + Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); + // GEPs with a single 0 index can be merged with direct loads + if (Indices.size() == 1 && Indices.front() == 0) + Indices.clear(); + ArgIndices.insert(std::make_pair(SrcTy, Indices)); + LoadInst *OrigLoad; + if (LoadInst *L = dyn_cast<LoadInst>(UI)) + OrigLoad = L; + else + // Take any load, we will use it only to update Alias Analysis + OrigLoad = cast<LoadInst>(UI->user_back()); + OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; + } - do { // Iterate until we stop promoting from this SCC. - LocalChange = false; - // Attempt to promote arguments from all functions in this SCC. - for (CallGraphNode *OldNode : SCC) { - if (CallGraphNode *NewNode = - PromoteArguments(OldNode, CG, AARGetter, MaxElements)) { - LocalChange = true; - SCC.ReplaceNode(OldNode, NewNode); + // Add a parameter to the function for each element passed in. 
+ for (const auto &ArgIndex : ArgIndices) { + // not allowed to dereference ->begin() if size() is 0 + Params.push_back(GetElementPtrInst::getIndexedType( + cast<PointerType>(I->getType()->getScalarType())->getElementType(), + ArgIndex.second)); + ArgAttrVec.push_back(AttributeSet()); + assert(Params.back()); } + + if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) + ++NumArgumentsPromoted; + else + ++NumAggregatesPromoted; } - Changed |= LocalChange; // Remember that we changed something. - } while (LocalChange); - - return Changed; -} + } -bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { - if (skipSCC(SCC)) - return false; + Type *RetTy = FTy->getReturnType(); - // Get the callgraph information that we need to update to reflect our - // changes. - CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); + // Construct the new function type using the new arguments. + FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); - // We compute dedicated AA results for each function in the SCC as needed. We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. - Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; - - return runImpl(SCC, CG, AARGetter, maxElements); -} + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->copyAttributesFrom(F); -/// \brief Checks if a type could have padding bytes. -static bool isDenselyPacked(Type *type, const DataLayout &DL) { + // Patch the pointer to LLVM function in debug info descriptor. + NF->setSubprogram(F->getSubprogram()); + F->setSubprogram(nullptr); - // There is no size information, so be conservative. 
- if (!type->isSized()) - return false; + DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" + << "From: " << *F); - // If the alloc size is not equal to the storage size, then there are padding - // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. - if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) - return false; + // Recompute the parameter attributes list based on the new arguments for + // the function. + NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(), + PAL.getRetAttributes(), ArgAttrVec)); + ArgAttrVec.clear(); - if (!isa<CompositeType>(type)) - return true; + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); - // For homogenous sequential types, check for padding within members. - if (SequentialType *seqTy = dyn_cast<SequentialType>(type)) - return isDenselyPacked(seqTy->getElementType(), DL); + // Loop over all of the callers of the function, transforming the call sites + // to pass in the loaded pointers. + // + SmallVector<Value *, 16> Args; + while (!F->use_empty()) { + CallSite CS(F->user_back()); + assert(CS.getCalledFunction() == F); + Instruction *Call = CS.getInstruction(); + const AttributeList &CallPAL = CS.getAttributes(); - // Check for padding within and between elements of a struct. - StructType *StructTy = cast<StructType>(type); - const StructLayout *Layout = DL.getStructLayout(StructTy); - uint64_t StartPos = 0; - for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { - Type *ElTy = StructTy->getElementType(i); - if (!isDenselyPacked(ElTy, DL)) - return false; - if (StartPos != Layout->getElementOffsetInBits(i)) - return false; - StartPos += DL.getTypeAllocSizeInBits(ElTy); - } + // Loop over the operands, inserting GEP and loads in the caller as + // appropriate. 
+ CallSite::arg_iterator AI = CS.arg_begin(); + ArgIndex = 1; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI, ++ArgIndex) + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { + Args.push_back(*AI); // Unmodified argument + ArgAttrVec.push_back(CallPAL.getAttributes(ArgIndex)); + } else if (ByValArgsToTransform.count(&*I)) { + // Emit a GEP and load for each element of the struct. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = { + ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = GetElementPtrInst::Create( + STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call); + // TODO: Tell AA about the new values? + Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call)); + ArgAttrVec.push_back(AttributeSet()); + } + } else if (!I->use_empty()) { + // Non-dead argument: insert GEPs and loads as appropriate. + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + // Store the Value* version of the indices in here, but declare it now + // for reuse. + std::vector<Value *> Ops; + for (const auto &ArgIndex : ArgIndices) { + Value *V = *AI; + LoadInst *OrigLoad = + OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; + if (!ArgIndex.second.empty()) { + Ops.reserve(ArgIndex.second.size()); + Type *ElTy = V->getType(); + for (unsigned long II : ArgIndex.second) { + // Use i32 to index structs, and i64 for others (pointers/arrays). + // This satisfies GEP constraints. + Type *IdxTy = + (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext()) + : Type::getInt64Ty(F->getContext())); + Ops.push_back(ConstantInt::get(IdxTy, II)); + // Keep track of the type we're currently indexing. 
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) + ElTy = ElPTy->getElementType(); + else + ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II); + } + // And create a GEP to extract those indices. + V = GetElementPtrInst::Create(ArgIndex.first, V, Ops, + V->getName() + ".idx", Call); + Ops.clear(); + } + // Since we're replacing a load make sure we take the alignment + // of the previous load. + LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call); + newLoad->setAlignment(OrigLoad->getAlignment()); + // Transfer the AA info too. + AAMDNodes AAInfo; + OrigLoad->getAAMetadata(AAInfo); + newLoad->setAAMetadata(AAInfo); - return true; -} + Args.push_back(newLoad); + ArgAttrVec.push_back(AttributeSet()); + } + } -/// \brief Checks if the padding bytes of an argument could be accessed. -static bool canPaddingBeAccessed(Argument *arg) { + // Push any varargs arguments on the list. + for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { + Args.push_back(*AI); + ArgAttrVec.push_back(CallPAL.getAttributes(ArgIndex)); + } - assert(arg->hasByValAttr()); + SmallVector<OperandBundleDef, 1> OpBundles; + CS.getOperandBundlesAsDefs(OpBundles); - // Track all the pointers to the argument to make sure they are not captured. 
- SmallPtrSet<Value *, 16> PtrValues; - PtrValues.insert(arg); + CallSite NewCS; + if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call); + } else { + auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call); + NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); + NewCS = NewCall; + } + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes( + AttributeList::get(F->getContext(), CallPAL.getFnAttributes(), + CallPAL.getRetAttributes(), ArgAttrVec)); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); + Args.clear(); + ArgAttrVec.clear(); - // Track all of the stores. - SmallVector<StoreInst *, 16> Stores; + // Update the callgraph to know that the callsite has been transformed. + if (ReplaceCallSite) + (*ReplaceCallSite)(CS, NewCS); - // Scan through the uses recursively to make sure the pointer is always used - // sanely. - SmallVector<Value *, 16> WorkList; - WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end()); - while (!WorkList.empty()) { - Value *V = WorkList.back(); - WorkList.pop_back(); - if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) { - if (PtrValues.insert(V).second) - WorkList.insert(WorkList.end(), V->user_begin(), V->user_end()); - } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) { - Stores.push_back(Store); - } else if (!isa<LoadInst>(V)) { - return true; + if (!Call->use_empty()) { + Call->replaceAllUsesWith(NewCS.getInstruction()); + NewCS->takeName(Call); } - } -// Check to make sure the pointers aren't captured - for (StoreInst *Store : Stores) - if (PtrValues.count(Store->getValueOperand())) - return true; - - return false; -} + // Finally, remove the old call from the program, reducing the use-count of + // F. 
+ Call->eraseFromParent(); + } -/// PromoteArguments - This method checks the specified function to see if there -/// are any promotable arguments and if it is safe to promote the function (for -/// example, all callers are direct). If safe to promote some arguments, it -/// calls the DoPromotion method. -/// -static CallGraphNode * -PromoteArguments(CallGraphNode *CGN, CallGraph &CG, - function_ref<AAResults &(Function &F)> AARGetter, - unsigned MaxElements) { - Function *F = CGN->getFunction(); + const DataLayout &DL = F->getParent()->getDataLayout(); - // Make sure that it is local to this module. - if (!F || !F->hasLocalLinkage()) return nullptr; + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); - // Don't promote arguments for variadic functions. Adding, removing, or - // changing non-pack parameters can change the classification of pack - // parameters. Frontends encode that classification at the call site in the - // IR, while in the callee the classification is determined dynamically based - // on the number of registers consumed so far. - if (F->isVarArg()) return nullptr; + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + // + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I) { + if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { + // If this is an unmodified argument, move the name and users over to the + // new version. + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); + ++I2; + continue; + } - // First check: see if there are any pointer arguments! If not, quick exit. 
- SmallVector<Argument*, 16> PointerArgs; - for (Argument &I : F->args()) - if (I.getType()->isPointerTy()) - PointerArgs.push_back(&I); - if (PointerArgs.empty()) return nullptr; + if (ByValArgsToTransform.count(&*I)) { + // In the callee, we create an alloca, and store each of the new incoming + // arguments into the alloca. + Instruction *InsertPt = &NF->begin()->front(); - // Second check: make sure that all callers are direct callers. We can't - // transform functions that have indirect callers. Also see if the function - // is self-recursive. - bool isSelfRecursive = false; - for (Use &U : F->uses()) { - CallSite CS(U.getUser()); - // Must be a direct call. - if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) return nullptr; - - if (CS.getInstruction()->getParent()->getParent() == F) - isSelfRecursive = true; - } - - const DataLayout &DL = F->getParent()->getDataLayout(); + // Just add all the struct element types. + Type *AgTy = cast<PointerType>(I->getType())->getElementType(); + Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, + "", InsertPt); + StructType *STy = cast<StructType>(AgTy); + Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), + nullptr}; - AAResults &AAR = AARGetter(*F); + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); + Value *Idx = GetElementPtrInst::Create( + AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), + InsertPt); + I2->setName(I->getName() + "." + Twine(i)); + new StoreInst(&*I2++, Idx, InsertPt); + } - // Check to see which arguments are promotable. If an argument is promotable, - // add it to ArgsToPromote. - SmallPtrSet<Argument*, 8> ArgsToPromote; - SmallPtrSet<Argument*, 8> ByValArgsToTransform; - for (Argument *PtrArg : PointerArgs) { - Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); + // Anything that used the arg should now use the alloca. 
+ I->replaceAllUsesWith(TheAlloca); + TheAlloca->takeName(&*I); - // Replace sret attribute with noalias. This reduces register pressure by - // avoiding a register copy. - if (PtrArg->hasStructRetAttr()) { - unsigned ArgNo = PtrArg->getArgNo(); - F->setAttributes( - F->getAttributes() - .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet) - .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); - for (Use &U : F->uses()) { - CallSite CS(U.getUser()); - CS.setAttributes( - CS.getAttributes() - .removeAttribute(F->getContext(), ArgNo + 1, - Attribute::StructRet) - .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); + // If the alloca is used in a call, we must clear the tail flag since + // the callee now uses an alloca from the caller. + for (User *U : TheAlloca->users()) { + CallInst *Call = dyn_cast<CallInst>(U); + if (!Call) + continue; + Call->setTailCall(false); } + continue; } - // If this is a byval argument, and if the aggregate type is small, just - // pass the elements, which is always safe, if the passed value is densely - // packed or if we can prove the padding bytes are never accessed. This does - // not apply to inalloca. - bool isSafeToPromote = - PtrArg->hasByValAttr() && - (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); - if (isSafeToPromote) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - if (MaxElements > 0 && STy->getNumElements() > MaxElements) { - DEBUG(dbgs() << "argpromotion disable promoting argument '" - << PtrArg->getName() << "' because it would require adding more" - << " than " << MaxElements << " arguments to the function.\n"); - continue; - } - - // If all the elements are single-value types, we can promote it. 
- bool AllSimple = true; - for (const auto *EltTy : STy->elements()) { - if (!EltTy->isSingleValueType()) { - AllSimple = false; - break; - } + if (I->use_empty()) + continue; + + // Otherwise, if we promoted this argument, then all users are load + // instructions (or GEPs with only load users), and all loads should be + // using the new argument that we added. + ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; + + while (!I->use_empty()) { + if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { + assert(ArgIndices.begin()->second.empty() && + "Load element should sort to front!"); + I2->setName(I->getName() + ".val"); + LI->replaceAllUsesWith(&*I2); + LI->eraseFromParent(); + DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() + << "' in function '" << F->getName() << "'\n"); + } else { + GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back()); + IndicesVector Operands; + Operands.reserve(GEP->getNumIndices()); + for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); + II != IE; ++II) + Operands.push_back(cast<ConstantInt>(*II)->getSExtValue()); + + // GEPs with a single 0 index can be merged with direct loads + if (Operands.size() == 1 && Operands.front() == 0) + Operands.clear(); + + Function::arg_iterator TheArg = I2; + for (ScalarizeTable::iterator It = ArgIndices.begin(); + It->second != Operands; ++It, ++TheArg) { + assert(It != ArgIndices.end() && "GEP not handled??"); } - // Safe to transform, don't even bother trying to "promote" it. - // Passing the elements as a scalar will allow sroa to hack on - // the new alloca we introduce. - if (AllSimple) { - ByValArgsToTransform.insert(PtrArg); - continue; + std::string NewName = I->getName(); + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + NewName += "." 
+ utostr(Operands[i]); } - } - } + NewName += ".val"; + TheArg->setName(NewName); - // If the argument is a recursive type and we're in a recursive - // function, we could end up infinitely peeling the function argument. - if (isSelfRecursive) { - if (StructType *STy = dyn_cast<StructType>(AgTy)) { - bool RecursiveType = false; - for (const auto *EltTy : STy->elements()) { - if (EltTy == PtrArg->getType()) { - RecursiveType = true; - break; - } + DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() + << "' of function '" << NF->getName() << "'\n"); + + // All of the uses must be load instructions. Replace them all with + // the argument specified by ArgNo. + while (!GEP->use_empty()) { + LoadInst *L = cast<LoadInst>(GEP->user_back()); + L->replaceAllUsesWith(&*TheArg); + L->eraseFromParent(); } - if (RecursiveType) - continue; + GEP->eraseFromParent(); } } - - // Otherwise, see if we can promote the pointer to its value. - if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR, - MaxElements)) - ArgsToPromote.insert(PtrArg); - } - // No promotable pointer arguments. - if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) - return nullptr; + // Increment I2 past all of the arguments added for this promoted pointer. + std::advance(I2, ArgIndices.size()); + } - return DoPromotion(F, ArgsToPromote, ByValArgsToTransform, CG); + return NF; } /// AllCallersPassInValidPointerForArgument - Return true if we can prove that /// all callees pass in a valid pointer for the specified function argument. -static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { +static bool allCallersPassInValidPointerForArgument(Argument *Arg) { Function *Callee = Arg->getParent(); const DataLayout &DL = Callee->getParent()->getDataLayout(); @@ -390,26 +469,25 @@ static bool AllCallersPassInValidPointerForArgument(Argument *Arg) { /// elements in Prefix is the same as the corresponding elements in Longer. 
/// /// This means it also returns true when Prefix and Longer are equal! -static bool IsPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { +static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) { if (Prefix.size() > Longer.size()) return false; return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); } - /// Checks if Indices, or a prefix of Indices, is in Set. -static bool PrefixIn(const IndicesVector &Indices, +static bool prefixIn(const IndicesVector &Indices, std::set<IndicesVector> &Set) { - std::set<IndicesVector>::iterator Low; - Low = Set.upper_bound(Indices); - if (Low != Set.begin()) - Low--; - // Low is now the last element smaller than or equal to Indices. This means - // it points to a prefix of Indices (possibly Indices itself), if such - // prefix exists. - // - // This load is safe if any prefix of its operands is safe to load. - return Low != Set.end() && IsPrefix(*Low, Indices); + std::set<IndicesVector>::iterator Low; + Low = Set.upper_bound(Indices); + if (Low != Set.begin()) + Low--; + // Low is now the last element smaller than or equal to Indices. This means + // it points to a prefix of Indices (possibly Indices itself), if such + // prefix exists. + // + // This load is safe if any prefix of its operands is safe to load. + return Low != Set.end() && isPrefix(*Low, Indices); } /// Mark the given indices (ToMark) as safe in the given set of indices @@ -417,7 +495,7 @@ static bool PrefixIn(const IndicesVector &Indices, /// is already a prefix of Indices in Safe, Indices are implicitely marked safe /// already. Furthermore, any indices that Indices is itself a prefix of, are /// removed from Safe (since they are implicitely safe because of Indices now). 
-static void MarkIndicesSafe(const IndicesVector &ToMark, +static void markIndicesSafe(const IndicesVector &ToMark, std::set<IndicesVector> &Safe) { std::set<IndicesVector>::iterator Low; Low = Safe.upper_bound(ToMark); @@ -428,7 +506,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark, // means it points to a prefix of Indices (possibly Indices itself), if // such prefix exists. if (Low != Safe.end()) { - if (IsPrefix(*Low, ToMark)) + if (isPrefix(*Low, ToMark)) // If there is already a prefix of these indices (or exactly these // indices) marked a safe, don't bother adding these indices return; @@ -441,7 +519,7 @@ static void MarkIndicesSafe(const IndicesVector &ToMark, ++Low; // If there we're a prefix of longer index list(s), remove those std::set<IndicesVector>::iterator End = Safe.end(); - while (Low != End && IsPrefix(ToMark, *Low)) { + while (Low != End && isPrefix(ToMark, *Low)) { std::set<IndicesVector>::iterator Remove = Low; ++Low; Safe.erase(Remove); @@ -486,7 +564,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, GEPIndicesSet ToPromote; // If the pointer is always valid, any load with first index 0 is valid. - if (isByValOrInAlloca || AllCallersPassInValidPointerForArgument(Arg)) + if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg)) SafeToUnconditionallyLoad.insert(IndicesVector(1, 0)); // First, iterate the entry block and mark loads of (geps of) arguments as @@ -512,25 +590,26 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, return false; // Indices checked out, mark them as safe - MarkIndicesSafe(Indices, SafeToUnconditionallyLoad); + markIndicesSafe(Indices, SafeToUnconditionallyLoad); Indices.clear(); } } else if (V == Arg) { // Direct loads are equivalent to a GEP with a single 0 index. 
- MarkIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); + markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad); } } // Now, iterate all uses of the argument to see if there are any uses that are // not (GEP+)loads, or any (GEP+)loads that are not safe to promote. - SmallVector<LoadInst*, 16> Loads; + SmallVector<LoadInst *, 16> Loads; IndicesVector Operands; for (Use &U : Arg->uses()) { User *UR = U.getUser(); Operands.clear(); if (LoadInst *LI = dyn_cast<LoadInst>(UR)) { // Don't hack volatile/atomic loads - if (!LI->isSimple()) return false; + if (!LI->isSimple()) + return false; Loads.push_back(LI); // Direct loads are equivalent to a GEP with a zero index and then a load. Operands.push_back(0); @@ -547,30 +626,31 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, } // Ensure that all of the indices are constants. - for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); - i != e; ++i) + for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e; + ++i) if (ConstantInt *C = dyn_cast<ConstantInt>(*i)) Operands.push_back(C->getSExtValue()); else - return false; // Not a constant operand GEP! + return false; // Not a constant operand GEP! // Ensure that the only users of the GEP are load instructions. for (User *GEPU : GEP->users()) if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) { // Don't hack volatile/atomic loads - if (!LI->isSimple()) return false; + if (!LI->isSimple()) + return false; Loads.push_back(LI); } else { // Other uses than load? return false; } } else { - return false; // Not a load or a GEP. + return false; // Not a load or a GEP. } // Now, see if it is safe to promote this load / loads of this GEP. Loading // is safe if Operands, or a prefix of Operands, is marked as safe. - if (!PrefixIn(Operands, SafeToUnconditionallyLoad)) + if (!prefixIn(Operands, SafeToUnconditionallyLoad)) return false; // See if we are already promoting a load with these indices. 
If not, check @@ -579,8 +659,10 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, if (ToPromote.find(Operands) == ToPromote.end()) { if (MaxElements > 0 && ToPromote.size() == MaxElements) { DEBUG(dbgs() << "argpromotion not promoting argument '" - << Arg->getName() << "' because it would require adding more " - << "than " << MaxElements << " arguments to the function.\n"); + << Arg->getName() + << "' because it would require adding more " + << "than " << MaxElements + << " arguments to the function.\n"); // We limit aggregate promotion to only promoting up to a fixed number // of elements of the aggregate. return false; @@ -589,7 +671,8 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, } } - if (Loads.empty()) return true; // No users, this is a dead argument. + if (Loads.empty()) + return true; // No users, this is a dead argument. // Okay, now we know that the argument is only used by load instructions and // it is safe to unconditionally perform all of them. Use alias analysis to @@ -598,7 +681,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, // Because there could be several/many load instructions, remember which // blocks we know to be transparent to the load. - df_iterator_default_set<BasicBlock*, 16> TranspBlocks; + df_iterator_default_set<BasicBlock *, 16> TranspBlocks; for (LoadInst *Load : Loads) { // Check to see if the load is invalidated from the start of the block to @@ -607,7 +690,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, MemoryLocation Loc = MemoryLocation::get(Load); if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod)) - return false; // Pointer is invalidated! + return false; // Pointer is invalidated! // Now check every path from the entry block to the load for transparency. 
// To do this, we perform a depth first search on the inverse CFG from the @@ -625,416 +708,352 @@ static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca, return true; } -/// DoPromotion - This method actually performs the promotion of the specified -/// arguments, and returns the new function. At this point, we know that it's -/// safe to do so. -static CallGraphNode * -DoPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, - SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) { +/// \brief Checks if a type could have padding bytes. +static bool isDenselyPacked(Type *type, const DataLayout &DL) { - // Start by computing a new prototype for the function, which is the same as - // the old function, but has modified arguments. - FunctionType *FTy = F->getFunctionType(); - std::vector<Type*> Params; + // There is no size information, so be conservative. + if (!type->isSized()) + return false; - typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable; + // If the alloc size is not equal to the storage size, then there are padding + // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128. + if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type)) + return false; - // ScalarizedElements - If we are promoting a pointer that has elements - // accessed out of it, keep track of which elements are accessed so that we - // can add one argument for each. - // - // Arguments that are directly loaded will have a zero element value here, to - // handle cases where there are both a direct load and GEP accesses. - // - std::map<Argument*, ScalarizeTable> ScalarizedElements; + if (!isa<CompositeType>(type)) + return true; - // OriginalLoads - Keep track of a representative load instruction from the - // original function so that we can tell the alias analysis implementation - // what the new GEP/Load instructions we are inserting look like. 
- // We need to keep the original loads for each argument and the elements - // of the argument that are accessed. - std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads; + // For homogenous sequential types, check for padding within members. + if (SequentialType *seqTy = dyn_cast<SequentialType>(type)) + return isDenselyPacked(seqTy->getElementType(), DL); - // Attribute - Keep track of the parameter attributes for the arguments - // that we are *not* promoting. For the ones that we do promote, the parameter - // attributes are lost - SmallVector<AttributeSet, 8> AttributesVec; - const AttributeSet &PAL = F->getAttributes(); + // Check for padding within and between elements of a struct. + StructType *StructTy = cast<StructType>(type); + const StructLayout *Layout = DL.getStructLayout(StructTy); + uint64_t StartPos = 0; + for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) { + Type *ElTy = StructTy->getElementType(i); + if (!isDenselyPacked(ElTy, DL)) + return false; + if (StartPos != Layout->getElementOffsetInBits(i)) + return false; + StartPos += DL.getTypeAllocSizeInBits(ElTy); + } - // Add any return attributes. - if (PAL.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - PAL.getRetAttributes())); + return true; +} - // First, determine the new argument list - unsigned ArgIndex = 1; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; - ++I, ++ArgIndex) { - if (ByValArgsToTransform.count(&*I)) { - // Simple byval argument? Just add all the struct element types. 
- Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - StructType *STy = cast<StructType>(AgTy); - Params.insert(Params.end(), STy->element_begin(), STy->element_end()); - ++NumByValArgsPromoted; - } else if (!ArgsToPromote.count(&*I)) { - // Unchanged argument - Params.push_back(I->getType()); - AttributeSet attrs = PAL.getParamAttributes(ArgIndex); - if (attrs.hasAttributes(ArgIndex)) { - AttrBuilder B(attrs, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Params.size(), B)); - } - } else if (I->use_empty()) { - // Dead argument (which are always marked as promotable) - ++NumArgumentsDead; - } else { - // Okay, this is being promoted. This means that the only uses are loads - // or GEPs which are only used by loads +/// \brief Checks if the padding bytes of an argument could be accessed. +static bool canPaddingBeAccessed(Argument *arg) { - // In this table, we will track which indices are loaded from the argument - // (where direct loads are tracked as no indices). - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - for (User *U : I->users()) { - Instruction *UI = cast<Instruction>(U); - Type *SrcTy; - if (LoadInst *L = dyn_cast<LoadInst>(UI)) - SrcTy = L->getType(); - else - SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType(); - IndicesVector Indices; - Indices.reserve(UI->getNumOperands() - 1); - // Since loads will only have a single operand, and GEPs only a single - // non-index operand, this will record direct loads without any indices, - // and gep+loads with the GEP indices. 
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end(); - II != IE; ++II) - Indices.push_back(cast<ConstantInt>(*II)->getSExtValue()); - // GEPs with a single 0 index can be merged with direct loads - if (Indices.size() == 1 && Indices.front() == 0) - Indices.clear(); - ArgIndices.insert(std::make_pair(SrcTy, Indices)); - LoadInst *OrigLoad; - if (LoadInst *L = dyn_cast<LoadInst>(UI)) - OrigLoad = L; - else - // Take any load, we will use it only to update Alias Analysis - OrigLoad = cast<LoadInst>(UI->user_back()); - OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad; - } + assert(arg->hasByValAttr()); - // Add a parameter to the function for each element passed in. - for (const auto &ArgIndex : ArgIndices) { - // not allowed to dereference ->begin() if size() is 0 - Params.push_back(GetElementPtrInst::getIndexedType( - cast<PointerType>(I->getType()->getScalarType())->getElementType(), - ArgIndex.second)); - assert(Params.back()); - } + // Track all the pointers to the argument to make sure they are not captured. + SmallPtrSet<Value *, 16> PtrValues; + PtrValues.insert(arg); - if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty()) - ++NumArgumentsPromoted; - else - ++NumAggregatesPromoted; + // Track all of the stores. + SmallVector<StoreInst *, 16> Stores; + + // Scan through the uses recursively to make sure the pointer is always used + // sanely. + SmallVector<Value *, 16> WorkList; + WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end()); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) { + if (PtrValues.insert(V).second) + WorkList.insert(WorkList.end(), V->user_begin(), V->user_end()); + } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) { + Stores.push_back(Store); + } else if (!isa<LoadInst>(V)) { + return true; } } - // Add any function attributes. 
- if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(FTy->getContext(), - PAL.getFnAttributes())); + // Check to make sure the pointers aren't captured + for (StoreInst *Store : Stores) + if (PtrValues.count(Store->getValueOperand())) + return true; - Type *RetTy = FTy->getReturnType(); + return false; +} - // Construct the new function type using the new arguments. - FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); +/// PromoteArguments - This method checks the specified function to see if there +/// are any promotable arguments and if it is safe to promote the function (for +/// example, all callers are direct). If safe to promote some arguments, it +/// calls the DoPromotion method. +/// +static Function * +promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter, + unsigned MaxElements, + Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>> + ReplaceCallSite) { + // Make sure that it is local to this module. + if (!F->hasLocalLinkage()) + return nullptr; - // Create the new function body and insert it into the module. - Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); - NF->copyAttributesFrom(F); + // Don't promote arguments for variadic functions. Adding, removing, or + // changing non-pack parameters can change the classification of pack + // parameters. Frontends encode that classification at the call site in the + // IR, while in the callee the classification is determined dynamically based + // on the number of registers consumed so far. + if (F->isVarArg()) + return nullptr; - // Patch the pointer to LLVM function in debug info descriptor. - NF->setSubprogram(F->getSubprogram()); - F->setSubprogram(nullptr); + // First check: see if there are any pointer arguments! If not, quick exit. 
+ SmallVector<Argument *, 16> PointerArgs; + for (Argument &I : F->args()) + if (I.getType()->isPointerTy()) + PointerArgs.push_back(&I); + if (PointerArgs.empty()) + return nullptr; - DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n" - << "From: " << *F); - - // Recompute the parameter attributes list based on the new arguments for - // the function. - NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec)); - AttributesVec.clear(); + // Second check: make sure that all callers are direct callers. We can't + // transform functions that have indirect callers. Also see if the function + // is self-recursive. + bool isSelfRecursive = false; + for (Use &U : F->uses()) { + CallSite CS(U.getUser()); + // Must be a direct call. + if (CS.getInstruction() == nullptr || !CS.isCallee(&U)) + return nullptr; - F->getParent()->getFunctionList().insert(F->getIterator(), NF); - NF->takeName(F); + if (CS.getInstruction()->getParent()->getParent() == F) + isSelfRecursive = true; + } - // Get a new callgraph node for NF. - CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + const DataLayout &DL = F->getParent()->getDataLayout(); - // Loop over all of the callers of the function, transforming the call sites - // to pass in the loaded pointers. - // - SmallVector<Value*, 16> Args; - while (!F->use_empty()) { - CallSite CS(F->user_back()); - assert(CS.getCalledFunction() == F); - Instruction *Call = CS.getInstruction(); - const AttributeSet &CallPAL = CS.getAttributes(); + AAResults &AAR = AARGetter(*F); - // Add any return attributes. - if (CallPAL.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - CallPAL.getRetAttributes())); + // Check to see which arguments are promotable. If an argument is promotable, + // add it to ArgsToPromote. 
+ SmallPtrSet<Argument *, 8> ArgsToPromote; + SmallPtrSet<Argument *, 8> ByValArgsToTransform; + for (Argument *PtrArg : PointerArgs) { + Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType(); - // Loop over the operands, inserting GEP and loads in the caller as - // appropriate. - CallSite::arg_iterator AI = CS.arg_begin(); - ArgIndex = 1; - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I, ++AI, ++ArgIndex) - if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { - Args.push_back(*AI); // Unmodified argument + // Replace sret attribute with noalias. This reduces register pressure by + // avoiding a register copy. + if (PtrArg->hasStructRetAttr()) { + unsigned ArgNo = PtrArg->getArgNo(); + F->setAttributes( + F->getAttributes() + .removeAttribute(F->getContext(), ArgNo + 1, Attribute::StructRet) + .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); + for (Use &U : F->uses()) { + CallSite CS(U.getUser()); + CS.setAttributes( + CS.getAttributes() + .removeAttribute(F->getContext(), ArgNo + 1, + Attribute::StructRet) + .addAttribute(F->getContext(), ArgNo + 1, Attribute::NoAlias)); + } + } - if (CallPAL.hasAttributes(ArgIndex)) { - AttrBuilder B(CallPAL, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); - } - } else if (ByValArgsToTransform.count(&*I)) { - // Emit a GEP and load for each element of the struct. - Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - StructType *STy = cast<StructType>(AgTy); - Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create( - STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call); - // TODO: Tell AA about the new values? 
- Args.push_back(new LoadInst(Idx, Idx->getName()+".val", Call)); + // If this is a byval argument, and if the aggregate type is small, just + // pass the elements, which is always safe, if the passed value is densely + // packed or if we can prove the padding bytes are never accessed. This does + // not apply to inalloca. + bool isSafeToPromote = + PtrArg->hasByValAttr() && + (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg)); + if (isSafeToPromote) { + if (StructType *STy = dyn_cast<StructType>(AgTy)) { + if (MaxElements > 0 && STy->getNumElements() > MaxElements) { + DEBUG(dbgs() << "argpromotion disable promoting argument '" + << PtrArg->getName() + << "' because it would require adding more" + << " than " << MaxElements + << " arguments to the function.\n"); + continue; } - } else if (!I->use_empty()) { - // Non-dead argument: insert GEPs and loads as appropriate. - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; - // Store the Value* version of the indices in here, but declare it now - // for reuse. - std::vector<Value*> Ops; - for (const auto &ArgIndex : ArgIndices) { - Value *V = *AI; - LoadInst *OrigLoad = - OriginalLoads[std::make_pair(&*I, ArgIndex.second)]; - if (!ArgIndex.second.empty()) { - Ops.reserve(ArgIndex.second.size()); - Type *ElTy = V->getType(); - for (unsigned long II : ArgIndex.second) { - // Use i32 to index structs, and i64 for others (pointers/arrays). - // This satisfies GEP constraints. - Type *IdxTy = (ElTy->isStructTy() ? - Type::getInt32Ty(F->getContext()) : - Type::getInt64Ty(F->getContext())); - Ops.push_back(ConstantInt::get(IdxTy, II)); - // Keep track of the type we're currently indexing. - if (auto *ElPTy = dyn_cast<PointerType>(ElTy)) - ElTy = ElPTy->getElementType(); - else - ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II); - } - // And create a GEP to extract those indices. 
- V = GetElementPtrInst::Create(ArgIndex.first, V, Ops, - V->getName() + ".idx", Call); - Ops.clear(); + + // If all the elements are single-value types, we can promote it. + bool AllSimple = true; + for (const auto *EltTy : STy->elements()) { + if (!EltTy->isSingleValueType()) { + AllSimple = false; + break; } - // Since we're replacing a load make sure we take the alignment - // of the previous load. - LoadInst *newLoad = new LoadInst(V, V->getName()+".val", Call); - newLoad->setAlignment(OrigLoad->getAlignment()); - // Transfer the AA info too. - AAMDNodes AAInfo; - OrigLoad->getAAMetadata(AAInfo); - newLoad->setAAMetadata(AAInfo); + } - Args.push_back(newLoad); + // Safe to transform, don't even bother trying to "promote" it. + // Passing the elements as a scalar will allow sroa to hack on + // the new alloca we introduce. + if (AllSimple) { + ByValArgsToTransform.insert(PtrArg); + continue; } } + } - // Push any varargs arguments on the list. - for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { - Args.push_back(*AI); - if (CallPAL.hasAttributes(ArgIndex)) { - AttrBuilder B(CallPAL, ArgIndex); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + // If the argument is a recursive type and we're in a recursive + // function, we could end up infinitely peeling the function argument. + if (isSelfRecursive) { + if (StructType *STy = dyn_cast<StructType>(AgTy)) { + bool RecursiveType = false; + for (const auto *EltTy : STy->elements()) { + if (EltTy == PtrArg->getType()) { + RecursiveType = true; + break; + } + } + if (RecursiveType) + continue; } } - // Add any function attributes. - if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Call->getContext(), - CallPAL.getFnAttributes())); + // Otherwise, see if we can promote the pointer to its value. 
+ if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR, + MaxElements)) + ArgsToPromote.insert(PtrArg); + } + + // No promotable pointer arguments. + if (ArgsToPromote.empty() && ByValArgsToTransform.empty()) + return nullptr; - SmallVector<OperandBundleDef, 1> OpBundles; - CS.getOperandBundlesAsDefs(OpBundles); + return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite); +} - Instruction *New; - if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", Call); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(), - AttributesVec)); - } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(), - AttributesVec)); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); - } - New->setDebugLoc(Call->getDebugLoc()); - Args.clear(); - AttributesVec.clear(); +PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, + CGSCCAnalysisManager &AM, + LazyCallGraph &CG, + CGSCCUpdateResult &UR) { + bool Changed = false, LocalChange; - // Update the callgraph to know that the callsite has been transformed. - CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()]; - CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN); + // Iterate until we stop promoting from this SCC. + do { + LocalChange = false; - if (!Call->use_empty()) { - Call->replaceAllUsesWith(New); - New->takeName(Call); + for (LazyCallGraph::Node &N : C) { + Function &OldF = N.getFunction(); + + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager(); + // FIXME: This lambda must only be used with this function. We should + // skip the lambda and just get the AA results directly. 
+ auto AARGetter = [&](Function &F) -> AAResults & { + assert(&F == &OldF && "Called with an unexpected function!"); + return FAM.getResult<AAManager>(F); + }; + + Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None); + if (!NewF) + continue; + LocalChange = true; + + // Directly substitute the functions in the call graph. Note that this + // requires the old function to be completely dead and completely + // replaced by the new function. It does no call graph updates, it merely + // swaps out the particular function mapped to a particular node in the + // graph. + C.getOuterRefSCC().replaceNodeFunction(N, *NewF); + OldF.eraseFromParent(); } - // Finally, remove the old call from the program, reducing the use-count of - // F. - Call->eraseFromParent(); - } - - // Since we have now created the new function, splice the body of the old - // function right into the new function, leaving the old rotting hulk of the - // function empty. - NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + Changed |= LocalChange; + } while (LocalChange); - // Loop over the argument list, transferring uses of the old arguments over to - // the new arguments, also transferring over the names as well. - // - for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), - I2 = NF->arg_begin(); I != E; ++I) { - if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) { - // If this is an unmodified argument, move the name and users over to the - // new version. - I->replaceAllUsesWith(&*I2); - I2->takeName(&*I); - ++I2; - continue; - } + if (!Changed) + return PreservedAnalyses::all(); - if (ByValArgsToTransform.count(&*I)) { - // In the callee, we create an alloca, and store each of the new incoming - // arguments into the alloca. - Instruction *InsertPt = &NF->begin()->front(); + return PreservedAnalyses::none(); +} - // Just add all the struct element types. 
- Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt); - StructType *STy = cast<StructType>(AgTy); - Value *Idxs[2] = { - ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr }; +namespace { +/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass. +/// +struct ArgPromotion : public CallGraphSCCPass { + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + getAAResultsAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); + } - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i); - Value *Idx = GetElementPtrInst::Create( - AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i), - InsertPt); - I2->setName(I->getName()+"."+Twine(i)); - new StoreInst(&*I2++, Idx, InsertPt); - } + bool runOnSCC(CallGraphSCC &SCC) override; + static char ID; // Pass identification, replacement for typeid + explicit ArgPromotion(unsigned MaxElements = 3) + : CallGraphSCCPass(ID), MaxElements(MaxElements) { + initializeArgPromotionPass(*PassRegistry::getPassRegistry()); + } - // Anything that used the arg should now use the alloca. - I->replaceAllUsesWith(TheAlloca); - TheAlloca->takeName(&*I); +private: + using llvm::Pass::doInitialization; + bool doInitialization(CallGraph &CG) override; + /// The maximum number of elements to expand, or 0 for unlimited. + unsigned MaxElements; +}; +} - // If the alloca is used in a call, we must clear the tail flag since - // the callee now uses an alloca from the caller. 
- for (User *U : TheAlloca->users()) { - CallInst *Call = dyn_cast<CallInst>(U); - if (!Call) - continue; - Call->setTailCall(false); - } - continue; - } +char ArgPromotion::ID = 0; +INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, + false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(ArgPromotion, "argpromotion", + "Promote 'by reference' arguments to scalars", false, false) - if (I->use_empty()) - continue; +Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) { + return new ArgPromotion(MaxElements); +} - // Otherwise, if we promoted this argument, then all users are load - // instructions (or GEPs with only load users), and all loads should be - // using the new argument that we added. - ScalarizeTable &ArgIndices = ScalarizedElements[&*I]; +bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) { + if (skipSCC(SCC)) + return false; - while (!I->use_empty()) { - if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) { - assert(ArgIndices.begin()->second.empty() && - "Load element should sort to front!"); - I2->setName(I->getName()+".val"); - LI->replaceAllUsesWith(&*I2); - LI->eraseFromParent(); - DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName() - << "' in function '" << F->getName() << "'\n"); - } else { - GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back()); - IndicesVector Operands; - Operands.reserve(GEP->getNumIndices()); - for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end(); - II != IE; ++II) - Operands.push_back(cast<ConstantInt>(*II)->getSExtValue()); + // Get the callgraph information that we need to update to reflect our + // changes. 
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); - // GEPs with a single 0 index can be merged with direct loads - if (Operands.size() == 1 && Operands.front() == 0) - Operands.clear(); + LegacyAARGetter AARGetter(*this); - Function::arg_iterator TheArg = I2; - for (ScalarizeTable::iterator It = ArgIndices.begin(); - It->second != Operands; ++It, ++TheArg) { - assert(It != ArgIndices.end() && "GEP not handled??"); - } + bool Changed = false, LocalChange; - std::string NewName = I->getName(); - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - NewName += "." + utostr(Operands[i]); - } - NewName += ".val"; - TheArg->setName(NewName); + // Iterate until we stop promoting from this SCC. + do { + LocalChange = false; + // Attempt to promote arguments from all functions in this SCC. + for (CallGraphNode *OldNode : SCC) { + Function *OldF = OldNode->getFunction(); + if (!OldF) + continue; + + auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) { + Function *Caller = OldCS.getInstruction()->getParent()->getParent(); + CallGraphNode *NewCalleeNode = + CG.getOrInsertFunction(NewCS.getCalledFunction()); + CallGraphNode *CallerNode = CG[Caller]; + CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode); + }; + + if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements, + {ReplaceCallSite})) { + LocalChange = true; - DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName() - << "' of function '" << NF->getName() << "'\n"); + // Update the call graph for the newly promoted function. + CallGraphNode *NewNode = CG.getOrInsertFunction(NewF); + NewNode->stealCalledFunctionsFrom(OldNode); + if (OldNode->getNumReferences() == 0) + delete CG.removeFunctionFromModule(OldNode); + else + OldF->setLinkage(Function::ExternalLinkage); - // All of the uses must be load instructions. Replace them all with - // the argument specified by ArgNo. 
- while (!GEP->use_empty()) { - LoadInst *L = cast<LoadInst>(GEP->user_back()); - L->replaceAllUsesWith(&*TheArg); - L->eraseFromParent(); - } - GEP->eraseFromParent(); + // And update the SCC we're iterating as well. + SCC.ReplaceNode(OldNode, NewNode); } } + // Remember that we changed something. + Changed |= LocalChange; + } while (LocalChange); - // Increment I2 past all of the arguments added for this promoted pointer. - std::advance(I2, ArgIndices.size()); - } - - NF_CGN->stealCalledFunctionsFrom(CG[F]); - - // Now that the old function is dead, delete it. If there is a dangling - // reference to the CallgraphNode, just leave the dead function around for - // someone else to nuke. - CallGraphNode *CGN = CG[F]; - if (CGN->getNumReferences() == 0) - delete CG.removeFunctionFromModule(CGN); - else - F->setLinkage(Function::ExternalLinkage); - - return NF_CGN; + return Changed; } bool ArgPromotion::doInitialization(CallGraph &CG) { diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index d75ed206ad23..62b5a9c9ba26 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -60,6 +60,23 @@ static bool IsBetterCanonical(const GlobalVariable &A, return A.hasGlobalUnnamedAddr(); } +static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) { + SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; + GV->getAllMetadata(MDs); + for (const auto &V : MDs) + if (V.first != LLVMContext::MD_dbg) + return true; + return false; +} + +static void copyDebugLocMetadata(const GlobalVariable *From, + GlobalVariable *To) { + SmallVector<DIGlobalVariableExpression *, 1> MDs; + From->getDebugInfo(MDs); + for (auto MD : MDs) + To->addDebugInfo(MD); +} + static unsigned getAlignment(GlobalVariable *GV) { unsigned Align = GV->getAlignment(); if (Align) @@ -113,6 +130,10 @@ static bool mergeConstants(Module &M) { if (GV->isWeakForLinker()) continue; + // Don't touch globals with metadata other than !dbg. 
+ if (hasMetadataOtherThanDebugLoc(GV)) + continue; + Constant *Init = GV->getInitializer(); // Check to see if the initializer is already known. @@ -155,6 +176,9 @@ static bool mergeConstants(Module &M) { if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr()) continue; + if (hasMetadataOtherThanDebugLoc(GV)) + continue; + if (!GV->hasGlobalUnnamedAddr()) Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None); @@ -178,6 +202,8 @@ static bool mergeConstants(Module &M) { getAlignment(Replacements[i].second))); } + copyDebugLocMetadata(Replacements[i].first, Replacements[i].second); + // Eliminate any uses of the dead global. Replacements[i].first->replaceAllUsesWith(Replacements[i].second); diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp index ba2e60dee3bc..1b111de06157 100644 --- a/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -98,8 +98,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { LLVMContext &Ctx = M.getContext(); Constant *C = M.getOrInsertFunction( "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx), - Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr); + Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx)); Function *F = dyn_cast<Function>(C); + // Take over the existing function. The frontend emits a weak stub so that the + // linker knows about the symbol; this pass replaces the function body. 
+ F->deleteBody(); F->setAlignment(4096); auto args = F->arg_begin(); Value &CallSiteTypeId = *(args++); @@ -117,7 +120,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) { IRBuilder<> IRBFail(TrapBB); Constant *CFICheckFailFn = M.getOrInsertFunction( "__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx), - Type::getInt8PtrTy(Ctx), nullptr); + Type::getInt8PtrTy(Ctx)); IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr}); IRBFail.CreateBr(ExitBB); diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 1a5ed4692211..375b74c494d9 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -166,41 +166,43 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) { Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs); // Drop any attributes that were on the vararg arguments. - AttributeSet PAL = CS.getAttributes(); + AttributeList PAL = CS.getAttributes(); if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) { - SmallVector<AttributeSet, 8> AttributesVec; + SmallVector<AttributeList, 8> AttributesVec; for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i) AttributesVec.push_back(PAL.getSlotAttributes(i)); - if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Fn.getContext(), - PAL.getFnAttributes())); - PAL = AttributeSet::get(Fn.getContext(), AttributesVec); + if (PAL.hasAttributes(AttributeList::FunctionIndex)) + AttributesVec.push_back(AttributeList::get(Fn.getContext(), + AttributeList::FunctionIndex, + PAL.getFnAttributes())); + PAL = AttributeList::get(Fn.getContext(), AttributesVec); } SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); - Instruction *New; + CallSite NewCS; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", 
Call); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(PAL); + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call); } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(PAL); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); + NewCS = CallInst::Create(NF, Args, OpBundles, "", Call); + cast<CallInst>(NewCS.getInstruction()) + ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); } - New->setDebugLoc(Call->getDebugLoc()); + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(PAL); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); Args.clear(); if (!Call->use_empty()) - Call->replaceAllUsesWith(New); + Call->replaceAllUsesWith(NewCS.getInstruction()); - New->takeName(Call); + NewCS->takeName(Call); // Finally, remove the old call from the program, reducing the use-count of // F. @@ -681,8 +683,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { bool HasLiveReturnedArg = false; // Set up to build a new list of parameter attributes. - SmallVector<AttributeSet, 8> AttributesVec; - const AttributeSet &PAL = F->getAttributes(); + SmallVector<AttributeSet, 8> ArgAttrVec; + const AttributeList &PAL = F->getAttributes(); // Remember which arguments are still alive. SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false); @@ -696,16 +698,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (LiveValues.erase(Arg)) { Params.push_back(I->getType()); ArgAlive[i] = true; - - // Get the original parameter attributes (skipping the first one, that is - // for the return value. 
- if (PAL.hasAttributes(i + 1)) { - AttrBuilder B(PAL, i + 1); - if (B.contains(Attribute::Returned)) - HasLiveReturnedArg = true; - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Params.size(), B)); - } + ArgAttrVec.push_back(PAL.getParamAttributes(i)); + HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned); } else { ++NumArgumentsEliminated; DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i @@ -779,30 +773,24 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { assert(NRetTy && "No new return type found?"); // The existing function return attributes. - AttributeSet RAttrs = PAL.getRetAttributes(); + AttrBuilder RAttrs(PAL.getRetAttributes()); // Remove any incompatible attributes, but only if we removed all return // values. Otherwise, ensure that we don't have any conflicting attributes // here. Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) - RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); else - assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex). - overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && + assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && "Return attributes no longer compatible?"); - if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs)); - - if (PAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(F->getContext(), - PAL.getFnAttributes())); + AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Reconstruct the AttributesList based on the vector we constructed. 
- AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec); + assert(ArgAttrVec.size() == Params.size()); + AttributeList NewPAL = AttributeList::get( + F->getContext(), PAL.getFnAttributes(), RetAttrs, ArgAttrVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -829,18 +817,14 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { CallSite CS(F->user_back()); Instruction *Call = CS.getInstruction(); - AttributesVec.clear(); - const AttributeSet &CallPAL = CS.getAttributes(); - - // The call return attributes. - AttributeSet RAttrs = CallPAL.getRetAttributes(); + ArgAttrVec.clear(); + const AttributeList &CallPAL = CS.getAttributes(); - // Adjust in case the function was changed to return void. - RAttrs = RAttrs.removeAttributes(NRetTy->getContext(), - AttributeSet::ReturnIndex, - AttributeFuncs::typeIncompatible(NF->getReturnType())); - if (RAttrs.hasAttributes(AttributeSet::ReturnIndex)) - AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs)); + // Adjust the call return attributes in case the function was changed to + // return void. + AttrBuilder RAttrs(CallPAL.getRetAttributes()); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Declare these outside of the loops, so we can reuse them for the second // loop, which loops the varargs. @@ -852,57 +836,55 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) { if (ArgAlive[i]) { Args.push_back(*I); // Get original parameter attributes, but skip return attributes. - if (CallPAL.hasAttributes(i + 1)) { - AttrBuilder B(CallPAL, i + 1); + AttributeSet Attrs = CallPAL.getParamAttributes(i); + if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) { // If the return type has changed, then get rid of 'returned' on the // call site. 
The alternative is to make all 'returned' attributes on // call sites keep the return value alive just like 'returned' - // attributes on function declaration but it's less clearly a win - // and this is not an expected case anyway - if (NRetTy != RetTy && B.contains(Attribute::Returned)) - B.removeAttribute(Attribute::Returned); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); + // attributes on function declaration but it's less clearly a win and + // this is not an expected case anyway + ArgAttrVec.push_back(AttributeSet::get( + F->getContext(), + AttrBuilder(Attrs).removeAttribute(Attribute::Returned))); + } else { + // Otherwise, use the original attributes. + ArgAttrVec.push_back(Attrs); } } // Push any varargs arguments on the list. Don't forget their attributes. for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) { Args.push_back(*I); - if (CallPAL.hasAttributes(i + 1)) { - AttrBuilder B(CallPAL, i + 1); - AttributesVec. - push_back(AttributeSet::get(F->getContext(), Args.size(), B)); - } + ArgAttrVec.push_back(CallPAL.getParamAttributes(i)); } - if (CallPAL.hasAttributes(AttributeSet::FunctionIndex)) - AttributesVec.push_back(AttributeSet::get(Call->getContext(), - CallPAL.getFnAttributes())); - // Reconstruct the AttributesList based on the vector we constructed. 
- AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec); + assert(ArgAttrVec.size() == Args.size()); + AttributeList NewCallPAL = AttributeList::get( + F->getContext(), CallPAL.getFnAttributes(), RetAttrs, ArgAttrVec); SmallVector<OperandBundleDef, 1> OpBundles; CS.getOperandBundlesAsDefs(OpBundles); - Instruction *New; + CallSite NewCS; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { - New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), - Args, OpBundles, "", Call->getParent()); - cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(NewCallPAL); + NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), + Args, OpBundles, "", Call->getParent()); } else { - New = CallInst::Create(NF, Args, OpBundles, "", Call); - cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(NewCallPAL); - cast<CallInst>(New)->setTailCallKind( - cast<CallInst>(Call)->getTailCallKind()); + NewCS = CallInst::Create(NF, Args, OpBundles, "", Call); + cast<CallInst>(NewCS.getInstruction()) + ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind()); } - New->setDebugLoc(Call->getDebugLoc()); - + NewCS.setCallingConv(CS.getCallingConv()); + NewCS.setAttributes(NewCallPAL); + NewCS->setDebugLoc(Call->getDebugLoc()); + uint64_t W; + if (Call->extractProfTotalWeight(W)) + NewCS->setProfWeight(W); Args.clear(); + ArgAttrVec.clear(); + Instruction *New = NewCS.getInstruction(); if (!Call->use_empty()) { if (New->getType() == Call->getType()) { // Return type not changed? Just replace users then. 
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index 402a66552c24..4d13b3f40688 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -49,31 +49,35 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias"); STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); -namespace { -typedef SmallSetVector<Function *, 8> SCCNodeSet; -} +// FIXME: This is disabled by default to avoid exposing security vulnerabilities +// in C/C++ code compiled by clang: +// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html +static cl::opt<bool> EnableNonnullArgPropagation( + "enable-nonnull-arg-prop", cl::Hidden, + cl::desc("Try to propagate nonnull argument attributes from callsites to " + "caller functions.")); namespace { -/// The three kinds of memory access relevant to 'readonly' and -/// 'readnone' attributes. -enum MemoryAccessKind { - MAK_ReadNone = 0, - MAK_ReadOnly = 1, - MAK_MayWrite = 2 -}; +typedef SmallSetVector<Function *, 8> SCCNodeSet; } -static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, +/// Returns the memory access attribute for function F using AAR for AA results, +/// where SCCNodes is the current SCC. +/// +/// If ThisBody is true, this function may examine the function body and will +/// return a result pertaining to this copy of the function. If it is false, the +/// result will be based only on AA results for the function declaration; it +/// will be assumed that some other (perhaps less optimized) version of the +/// function may be selected at link time. +static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody, + AAResults &AAR, const SCCNodeSet &SCCNodes) { FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F); if (MRB == FMRB_DoesNotAccessMemory) // Already perfect! 
return MAK_ReadNone; - // Non-exact function definitions may not be selected at link time, and an - // alternative version that writes to memory may be selected. See the comment - // on GlobalValue::isDefinitionExact for more details. - if (!F.hasExactDefinition()) { + if (!ThisBody) { if (AliasAnalysis::onlyReadsMemory(MRB)) return MAK_ReadOnly; @@ -172,9 +176,14 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR, return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone; } +MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F, + AAResults &AAR) { + return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {}); +} + /// Deduce readonly/readnone attributes for the SCC. template <typename AARGetterT> -static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { +static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) { // Check if any of the functions in the SCC read or write memory. If they // write memory then they can't be marked readnone or readonly. bool ReadsMemory = false; @@ -182,7 +191,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { // Call the callable parameter to look up AA results for this function. AAResults &AAR = AARGetter(*F); - switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) { + // Non-exact function definitions may not be selected at link time, and an + // alternative version that writes to memory may be selected. See the + // comment on GlobalValue::isDefinitionExact for more details. 
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(), + AAR, SCCNodes)) { case MAK_MayWrite: return false; case MAK_ReadOnly: @@ -212,11 +225,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) { AttrBuilder B; B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone); F->removeAttributes( - AttributeSet::FunctionIndex, - AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B)); + AttributeList::FunctionIndex, + AttributeList::get(F->getContext(), AttributeList::FunctionIndex, B)); // Add in the new attribute. - F->addAttribute(AttributeSet::FunctionIndex, + F->addAttribute(AttributeList::FunctionIndex, ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone); if (ReadsMemory) @@ -522,7 +535,7 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { if (Value *RetArg = FindRetArg()) { auto *A = cast<Argument>(RetArg); - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B)); ++NumReturned; Changed = true; } @@ -531,6 +544,49 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) { return Changed; } +/// If a callsite has arguments that are also arguments to the parent function, +/// try to propagate attributes from the callsite's arguments to the parent's +/// arguments. This may be important because inlining can cause information loss +/// when attribute knowledge disappears with the inlined call. +static bool addArgumentAttrsFromCallsites(Function &F) { + if (!EnableNonnullArgPropagation) + return false; + + bool Changed = false; + + // For an argument attribute to transfer from a callsite to the parent, the + // call must be guaranteed to execute every time the parent is called. + // Conservatively, just check for calls in the entry block that are guaranteed + // to execute. 
+ // TODO: This could be enhanced by testing if the callsite post-dominates the + // entry block or by doing simple forward walks or backward walks to the + // callsite. + BasicBlock &Entry = F.getEntryBlock(); + for (Instruction &I : Entry) { + if (auto CS = CallSite(&I)) { + if (auto *CalledFunc = CS.getCalledFunction()) { + for (auto &CSArg : CalledFunc->args()) { + if (!CSArg.hasNonNullAttr()) + continue; + + // If the non-null callsite argument operand is an argument to 'F' + // (the caller) and the call is guaranteed to execute, then the value + // must be non-null throughout 'F'. + auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo())); + if (FArg && !FArg->hasNonNullAttr()) { + FArg->addAttr(Attribute::NonNull); + Changed = true; + } + } + } + } + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + } + + return Changed; +} + /// Deduce nocapture attributes for the SCC. static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; @@ -549,6 +605,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (!F->hasExactDefinition()) continue; + Changed |= addArgumentAttrsFromCallsites(*F); + // Functions that are readonly (or readnone) and nounwind and don't return // a value can't capture arguments. Don't analyze them. if (F->onlyReadsMemory() && F->doesNotThrow() && @@ -556,7 +614,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E; ++A) { if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) { - A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } @@ -576,7 +634,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (Tracker.Uses.empty()) { // If it's trivially not captured, mark it nocapture now. 
A->addAttr( - AttributeSet::get(F->getContext(), A->getArgNo() + 1, B)); + AttributeList::get(F->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } else { @@ -604,7 +662,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (R != Attribute::None) { AttrBuilder B; B.addAttribute(R); - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B)); Changed = true; R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; } @@ -629,7 +687,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } @@ -671,7 +729,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B)); ++NumNoCapture; Changed = true; } @@ -708,8 +766,9 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; // Clear out existing readonly/readnone attributes - A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R)); - A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B)); + A->removeAttr( + AttributeList::get(A->getContext(), A->getArgNo() + 1, R)); + A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B)); ReadAttr == Attribute::ReadOnly ? 
++NumReadOnlyArg : ++NumReadNoneArg; Changed = true; } @@ -769,7 +828,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { case Instruction::Call: case Instruction::Invoke: { CallSite CS(RVI); - if (CS.paramHasAttr(0, Attribute::NoAlias)) + if (CS.hasRetAttr(Attribute::NoAlias)) break; if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction())) break; @@ -905,7 +964,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // pointers. for (Function *F : SCCNodes) { // Already nonnull. - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull)) continue; @@ -926,7 +985,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { // Mark the function eagerly since we may discover a function // which prevents us from speculating about the entire SCC DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); ++NumNonNullReturn; MadeChange = true; } @@ -939,13 +998,13 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) { if (SCCReturnsNonNull) { for (Function *F : SCCNodes) { - if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex, + if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex, Attribute::NonNull) || !F->getReturnType()->isPointerTy()) continue; DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n"); - F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); ++NumNonNullReturn; MadeChange = true; } @@ -1163,19 +1222,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) { bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) { if (skipSCC(SCC)) return false; - - // We compute dedicated AA results for each function in the SCC as needed. 
We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. - Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; - - return runImpl(SCC, AARGetter); + return runImpl(SCC, LegacyAARGetter(*this)); } namespace { @@ -1275,16 +1322,9 @@ PreservedAnalyses ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { auto &CG = AM.getResult<CallGraphAnalysis>(M); - bool Changed = deduceFunctionAttributeInRPO(M, CG); - - // CallGraphAnalysis holds AssertingVH and must be invalidated eagerly so - // that other passes don't delete stuff from under it. - // FIXME: We need to invalidate this to avoid PR28400. Is there a better - // solution? - AM.invalidate<CallGraphAnalysis>(M); - - if (!Changed) + if (!deduceFunctionAttributeInRPO(M, CG)) return PreservedAnalyses::all(); + PreservedAnalyses PA; PA.preserve<CallGraphAnalysis>(); return PA; diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp index 6b32f6c31f72..d66411f04cc4 100644 --- a/lib/Transforms/IPO/FunctionImport.cpp +++ b/lib/Transforms/IPO/FunctionImport.cpp @@ -75,12 +75,6 @@ static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden, static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden, cl::desc("Compute dead symbols")); -// Temporary allows the function import pass to disable always linking -// referenced discardable symbols. 
-static cl::opt<bool> - DontForceImportReferencedDiscardableSymbols("disable-force-link-odr", - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableImportMetadata( "enable-import-metadata", cl::init( #if !defined(NDEBUG) @@ -124,7 +118,7 @@ namespace { static const GlobalValueSummary * selectCallee(const ModuleSummaryIndex &Index, const GlobalValueSummaryList &CalleeSummaryList, - unsigned Threshold) { + unsigned Threshold, StringRef CallerModulePath) { auto It = llvm::find_if( CalleeSummaryList, [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) { @@ -145,6 +139,21 @@ selectCallee(const ModuleSummaryIndex &Index, auto *Summary = cast<FunctionSummary>(GVSummary); + // If this is a local function, make sure we import the copy + // in the caller's module. The only time a local function can + // share an entry in the index is if there is a local with the same name + // in another module that had the same source file name (in a different + // directory), where each was compiled in their own directory so there + // was not distinguishing path. + // However, do the import from another module if there is only one + // entry in the list - in that case this must be a reference due + // to indirect call profile data, since a function pointer can point to + // a local in another module. + if (GlobalValue::isLocalLinkage(Summary->linkage()) && + CalleeSummaryList.size() > 1 && + Summary->modulePath() != CallerModulePath) + return false; + if (Summary->instCount() > Threshold) return false; @@ -163,11 +172,13 @@ selectCallee(const ModuleSummaryIndex &Index, /// null if there's no match. 
static const GlobalValueSummary *selectCallee(GlobalValue::GUID GUID, unsigned Threshold, - const ModuleSummaryIndex &Index) { + const ModuleSummaryIndex &Index, + StringRef CallerModulePath) { auto CalleeSummaryList = Index.findGlobalValueSummaryList(GUID); if (CalleeSummaryList == Index.end()) return nullptr; // This function does not have a summary - return selectCallee(Index, CalleeSummaryList->second, Threshold); + return selectCallee(Index, CalleeSummaryList->second, Threshold, + CallerModulePath); } using EdgeInfo = std::tuple<const FunctionSummary *, unsigned /* Threshold */, @@ -186,6 +197,15 @@ static void computeImportForFunction( auto GUID = Edge.first.getGUID(); DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n"); + if (Index.findGlobalValueSummaryList(GUID) == Index.end()) { + // For SamplePGO, the indirect call targets for local functions will + // have its original name annotated in profile. We try to find the + // corresponding PGOFuncName as the GUID. + GUID = Index.getGUIDFromOriginalID(GUID); + if (GUID == 0) + continue; + } + if (DefinedGVSummaries.count(GUID)) { DEBUG(dbgs() << "ignored! Target already in destination module.\n"); continue; @@ -202,7 +222,8 @@ static void computeImportForFunction( const auto NewThreshold = Threshold * GetBonusMultiplier(Edge.second.Hotness); - auto *CalleeSummary = selectCallee(GUID, NewThreshold, Index); + auto *CalleeSummary = + selectCallee(GUID, NewThreshold, Index, Summary.modulePath()); if (!CalleeSummary) { DEBUG(dbgs() << "ignored! No qualifying callee with summary found.\n"); continue; @@ -522,6 +543,23 @@ llvm::EmitImportsFiles(StringRef ModulePath, StringRef OutputFilename, /// Fixup WeakForLinker linkages in \p TheModule based on summary analysis. 
void llvm::thinLTOResolveWeakForLinkerModule( Module &TheModule, const GVSummaryMapTy &DefinedGlobals) { + auto ConvertToDeclaration = [](GlobalValue &GV) { + DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName() << "\n"); + if (Function *F = dyn_cast<Function>(&GV)) { + F->deleteBody(); + F->clearMetadata(); + } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) { + V->setInitializer(nullptr); + V->setLinkage(GlobalValue::ExternalLinkage); + V->clearMetadata(); + } else + // For now we don't resolve or drop aliases. Once we do we'll + // need to add support here for creating either a function or + // variable declaration, and return the new GlobalValue* for + // the caller to use. + llvm_unreachable("Expected function or variable"); + }; + auto updateLinkage = [&](GlobalValue &GV) { if (!GlobalValue::isWeakForLinker(GV.getLinkage())) return; @@ -532,18 +570,25 @@ void llvm::thinLTOResolveWeakForLinkerModule( auto NewLinkage = GS->second->linkage(); if (NewLinkage == GV.getLinkage()) return; - DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from " - << GV.getLinkage() << " to " << NewLinkage << "\n"); - GV.setLinkage(NewLinkage); - // Remove functions converted to available_externally from comdats, + // Check for a non-prevailing def that has interposable linkage + // (e.g. non-odr weak or linkonce). In that case we can't simply + // convert to available_externally, since it would lose the + // interposable property and possibly get inlined. Simply drop + // the definition in that case. 
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) && + GlobalValue::isInterposableLinkage(GV.getLinkage())) + ConvertToDeclaration(GV); + else { + DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName() << "` from " + << GV.getLinkage() << " to " << NewLinkage << "\n"); + GV.setLinkage(NewLinkage); + } + // Remove declarations from comdats, including available_externally // as this is a declaration for the linker, and will be dropped eventually. // It is illegal for comdats to contain declarations. auto *GO = dyn_cast_or_null<GlobalObject>(&GV); - if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) { - assert(GO->hasAvailableExternallyLinkage() && - "Expected comdat on definition (possibly available external)"); + if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) GO->setComdat(nullptr); - } }; // Process functions and global now @@ -562,7 +607,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // the current module. StringSet<> AsmUndefinedRefs; ModuleSymbolTable::CollectAsmSymbols( - Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(), + TheModule, [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) { if (Flags & object::BasicSymbolRef::SF_Undefined) AsmUndefinedRefs.insert(Name); @@ -617,14 +662,12 @@ void llvm::thinLTOInternalizeModule(Module &TheModule, // index. 
// Expected<bool> FunctionImporter::importFunctions( - Module &DestModule, const FunctionImporter::ImportMapTy &ImportList, - bool ForceImportReferencedDiscardableSymbols) { + Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) { DEBUG(dbgs() << "Starting import for Module " << DestModule.getModuleIdentifier() << "\n"); unsigned ImportedCount = 0; - // Linker that will be used for importing function - Linker TheLinker(DestModule); + IRMover Mover(DestModule); // Do the actual import of functions now, one Module at a time std::set<StringRef> ModuleNameOrderedList; for (auto &FunctionsToImportPerModule : ImportList) { @@ -648,7 +691,7 @@ Expected<bool> FunctionImporter::importFunctions( auto &ImportGUIDs = FunctionsToImportPerModule->second; // Find the globals to import - DenseSet<const GlobalValue *> GlobalsToImport; + SetVector<GlobalValue *> GlobalsToImport; for (Function &F : *SrcModule) { if (!F.hasName()) continue; @@ -687,6 +730,13 @@ Expected<bool> FunctionImporter::importFunctions( } } for (GlobalAlias &GA : SrcModule->aliases()) { + // FIXME: This should eventually be controlled entirely by the summary. 
+ if (FunctionImportGlobalProcessing::doImportAsDefinition( + &GA, &GlobalsToImport)) { + GlobalsToImport.insert(&GA); + continue; + } + if (!GA.hasName()) continue; auto GUID = GA.getGUID(); @@ -731,12 +781,9 @@ Expected<bool> FunctionImporter::importFunctions( << " from " << SrcModule->getSourceFileName() << "\n"; } - // Instruct the linker that the client will take care of linkonce resolution - unsigned Flags = Linker::Flags::None; - if (!ForceImportReferencedDiscardableSymbols) - Flags |= Linker::Flags::DontForceLinkLinkonceODR; - - if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport)) + if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(), + [](GlobalValue &, IRMover::ValueAdder) {}, + /*IsPerformingImport=*/true)) report_fatal_error("Function Import: link error"); ImportedCount += GlobalsToImport.size(); @@ -796,8 +843,7 @@ static bool doImportingForModule(Module &M) { return loadFile(Identifier, M.getContext()); }; FunctionImporter Importer(*Index, ModuleLoader); - Expected<bool> Result = Importer.importFunctions( - M, ImportList, !DontForceImportReferencedDiscardableSymbols); + Expected<bool> Result = Importer.importFunctions(M, ImportList); // FIXME: Probably need to propagate Errors through the pass manager. if (!Result) { diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 7a04de3d12db..c91e8b454927 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -25,7 +25,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" -#include <unordered_map> + using namespace llvm; #define DEBUG_TYPE "globaldce" @@ -50,7 +50,14 @@ namespace { if (skipModule(M)) return false; + // We need a minimally functional dummy module analysis manager. It needs + // to at least know about the possibility of proxying a function analysis + // manager. 
+ FunctionAnalysisManager DummyFAM; ModuleAnalysisManager DummyMAM; + DummyMAM.registerPass( + [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); }); + auto PA = Impl.run(M, DummyMAM); return !PA.areAllPreserved(); } @@ -78,9 +85,67 @@ static bool isEmptyFunction(Function *F) { return RI.getReturnValue() == nullptr; } -PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { +/// Compute the set of GlobalValue that depends from V. +/// The recursion stops as soon as a GlobalValue is met. +void GlobalDCEPass::ComputeDependencies(Value *V, + SmallPtrSetImpl<GlobalValue *> &Deps) { + if (auto *I = dyn_cast<Instruction>(V)) { + Function *Parent = I->getParent()->getParent(); + Deps.insert(Parent); + } else if (auto *GV = dyn_cast<GlobalValue>(V)) { + Deps.insert(GV); + } else if (auto *CE = dyn_cast<Constant>(V)) { + // Avoid walking the whole tree of a big ConstantExprs multiple times. + auto Where = ConstantDependenciesCache.find(CE); + if (Where != ConstantDependenciesCache.end()) { + auto const &K = Where->second; + Deps.insert(K.begin(), K.end()); + } else { + SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE]; + for (User *CEUser : CE->users()) + ComputeDependencies(CEUser, LocalDeps); + Deps.insert(LocalDeps.begin(), LocalDeps.end()); + } + } +} + +void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { + SmallPtrSet<GlobalValue *, 8> Deps; + for (User *User : GV.users()) + ComputeDependencies(User, Deps); + Deps.erase(&GV); // Remove self-reference. 
+ for (GlobalValue *GVU : Deps) { + GVDependencies.insert(std::make_pair(GVU, &GV)); + } +} + +/// Mark Global value as Live +void GlobalDCEPass::MarkLive(GlobalValue &GV, + SmallVectorImpl<GlobalValue *> *Updates) { + auto const Ret = AliveGlobals.insert(&GV); + if (!Ret.second) + return; + + if (Updates) + Updates->push_back(&GV); + if (Comdat *C = GV.getComdat()) { + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + MarkLive(*CM.second, Updates); // Recursion depth is only two because only + // globals in the same comdat are visited. + } +} + +PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { bool Changed = false; + // The algorithm first computes the set L of global variables that are + // trivially live. Then it walks the initialization of these variables to + // compute the globals used to initialize them, which effectively builds a + // directed graph where nodes are global variables, and an edge from A to B + // means B is used to initialize A. Finally, it propagates the liveness + // information through the graph starting from the nodes in L. Nodes note + // marked as alive are discarded. + // Remove empty functions from the global ctors list. Changed |= optimizeGlobalCtorsList(M, isEmptyFunction); @@ -103,21 +168,39 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { // initializer. if (!GO.isDeclaration() && !GO.hasAvailableExternallyLinkage()) if (!GO.isDiscardableIfUnused()) - GlobalIsNeeded(&GO); + MarkLive(GO); + + UpdateGVDependencies(GO); } + // Compute direct dependencies of aliases. for (GlobalAlias &GA : M.aliases()) { Changed |= RemoveUnusedGlobalValue(GA); // Externally visible aliases are needed. if (!GA.isDiscardableIfUnused()) - GlobalIsNeeded(&GA); + MarkLive(GA); + + UpdateGVDependencies(GA); } + // Compute direct dependencies of ifuncs. for (GlobalIFunc &GIF : M.ifuncs()) { Changed |= RemoveUnusedGlobalValue(GIF); // Externally visible ifuncs are needed. 
if (!GIF.isDiscardableIfUnused()) - GlobalIsNeeded(&GIF); + MarkLive(GIF); + + UpdateGVDependencies(GIF); + } + + // Propagate liveness from collected Global Values through the computed + // dependencies. + SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(), + AliveGlobals.end()}; + while (!NewLiveGVs.empty()) { + GlobalValue *LGV = NewLiveGVs.pop_back_val(); + for (auto &&GVD : make_range(GVDependencies.equal_range(LGV))) + MarkLive(*GVD.second, &NewLiveGVs); } // Now that all globals which are needed are in the AliveGlobals set, we loop @@ -154,7 +237,7 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { GA.setAliasee(nullptr); } - // The third pass drops targets of ifuncs which are dead... + // The fourth pass drops targets of ifuncs which are dead... std::vector<GlobalIFunc*> DeadIFuncs; for (GlobalIFunc &GIF : M.ifuncs()) if (!AliveGlobals.count(&GIF)) { @@ -188,7 +271,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { // Make sure that all memory is released AliveGlobals.clear(); - SeenConstants.clear(); + ConstantDependenciesCache.clear(); + GVDependencies.clear(); ComdatMembers.clear(); if (Changed) @@ -196,60 +280,6 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &) { return PreservedAnalyses::all(); } -/// GlobalIsNeeded - the specific global value as needed, and -/// recursively mark anything that it uses as also needed. -void GlobalDCEPass::GlobalIsNeeded(GlobalValue *G) { - // If the global is already in the set, no need to reprocess it. - if (!AliveGlobals.insert(G).second) - return; - - if (Comdat *C = G->getComdat()) { - for (auto &&CM : make_range(ComdatMembers.equal_range(C))) - GlobalIsNeeded(CM.second); - } - - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(G)) { - // If this is a global variable, we must make sure to add any global values - // referenced by the initializer to the alive set. 
- if (GV->hasInitializer()) - MarkUsedGlobalsAsNeeded(GV->getInitializer()); - } else if (GlobalIndirectSymbol *GIS = dyn_cast<GlobalIndirectSymbol>(G)) { - // The target of a global alias or ifunc is needed. - MarkUsedGlobalsAsNeeded(GIS->getIndirectSymbol()); - } else { - // Otherwise this must be a function object. We have to scan the body of - // the function looking for constants and global values which are used as - // operands. Any operands of these types must be processed to ensure that - // any globals used will be marked as needed. - Function *F = cast<Function>(G); - - for (Use &U : F->operands()) - MarkUsedGlobalsAsNeeded(cast<Constant>(U.get())); - - for (BasicBlock &BB : *F) - for (Instruction &I : BB) - for (Use &U : I.operands()) - if (GlobalValue *GV = dyn_cast<GlobalValue>(U)) - GlobalIsNeeded(GV); - else if (Constant *C = dyn_cast<Constant>(U)) - MarkUsedGlobalsAsNeeded(C); - } -} - -void GlobalDCEPass::MarkUsedGlobalsAsNeeded(Constant *C) { - if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) - return GlobalIsNeeded(GV); - - // Loop over all of the operands of the constant, adding any globals they - // use to the list of needed globals. - for (Use &U : C->operands()) { - // If we've already processed this constant there's no need to do it again. - Constant *Op = dyn_cast<Constant>(U); - if (Op && SeenConstants.insert(Op).second) - MarkUsedGlobalsAsNeeded(Op); - } -} - // RemoveUnusedGlobalValue - Loop over all of the uses of the specified // GlobalValue, looking for the constant pointer ref that may be pointing to it. 
// If found, check to see if the constant pointer ref is safe to destroy, and if diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 5b0d5e3bc01e..ade4f21ceb52 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -1819,12 +1819,14 @@ static bool processInternalGlobal( GS.AccessingFunction->doesNotRecurse() && isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV, LookupDomTree)) { + const DataLayout &DL = GV->getParent()->getDataLayout(); + DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n"); Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction ->getEntryBlock().begin()); Type *ElemTy = GV->getValueType(); // FIXME: Pass Global's alignment when globals have alignment - AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr, + AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr, GV->getName(), &FirstI); if (!isa<UndefValue>(GV->getInitializer())) new StoreInst(GV->getInitializer(), Alloca, &FirstI); @@ -1977,7 +1979,7 @@ static void ChangeCalleesToFastCall(Function *F) { } } -static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) { +static AttributeList StripNest(LLVMContext &C, const AttributeList &Attrs) { for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) { unsigned Index = Attrs.getSlotIndex(i); if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest)) @@ -2387,7 +2389,7 @@ OptimizeGlobalAliases(Module &M, } static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { - LibFunc::Func F = LibFunc::cxa_atexit; + LibFunc F = LibFunc_cxa_atexit; if (!TLI->has(F)) return nullptr; @@ -2396,7 +2398,7 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { return nullptr; // Make sure that the function has the correct prototype. 
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc::cxa_atexit) + if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit) return nullptr; return Fn; diff --git a/lib/Transforms/IPO/GlobalSplit.cpp b/lib/Transforms/IPO/GlobalSplit.cpp index bbbd096e89c0..4705ebe265ae 100644 --- a/lib/Transforms/IPO/GlobalSplit.cpp +++ b/lib/Transforms/IPO/GlobalSplit.cpp @@ -85,7 +85,16 @@ bool splitGlobal(GlobalVariable &GV) { uint64_t ByteOffset = cast<ConstantInt>( cast<ConstantAsMetadata>(Type->getOperand(0))->getValue()) ->getZExtValue(); - if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd) + // Type metadata may be attached one byte after the end of the vtable, for + // classes without virtual methods in Itanium ABI. AFAIK, it is never + // attached to the first byte of a vtable. Subtract one to get the right + // slice. + // This is making an assumption that vtable groups are the only kinds of + // global variables that !type metadata can be attached to, and that they + // are either Itanium ABI vtable groups or contain a single vtable (i.e. + // Microsoft ABI vtables). + uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1; + if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd) continue; SplitGV->addMetadata( LLVMContext::MD_type, diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp index 916135e33cd5..349807496dc2 100644 --- a/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -136,7 +136,13 @@ static bool PropagateConstantReturn(Function &F) { // For more details, see GlobalValue::mayBeDerefined. if (!F.isDefinitionExact()) return false; - + + // Don't touch naked functions. The may contain asm returning + // value we don't see, so we may end up interprocedurally propagating + // the return value incorrectly. + if (F.hasFnAttribute(Attribute::Naked)) + return false; + // Check to see if this function returns a constant. 
SmallVector<Value *,4> RetVals; StructType *STy = dyn_cast<StructType>(F.getReturnType()); diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp index 1770445b413f..50e7cc89a3b3 100644 --- a/lib/Transforms/IPO/InlineSimple.cpp +++ b/lib/Transforms/IPO/InlineSimple.cpp @@ -48,7 +48,7 @@ public: } explicit SimpleInliner(InlineParams Params) - : LegacyInlinerBase(ID), Params(Params) { + : LegacyInlinerBase(ID), Params(std::move(Params)) { initializeSimpleInlinerPass(*PassRegistry::getPassRegistry()); } @@ -61,7 +61,8 @@ public: [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, PSI); + return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache, + /*GetBFI=*/None, PSI); } bool runOnSCC(CallGraphSCC &SCC) override; @@ -92,8 +93,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) { } Pass *llvm::createFunctionInliningPass(unsigned OptLevel, - unsigned SizeOptLevel) { - return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel)); + unsigned SizeOptLevel, + bool DisableInlineHotCallSite) { + auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel); + if (DisableInlineHotCallSite) + Param.HotCallSiteThreshold = 0; + return new SimpleInliner(Param); } Pass *llvm::createFunctionInliningPass(InlineParams &Params) { diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index 3f4731c937d1..6c83c99ae3be 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -19,6 +19,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" @@ -260,8 +261,8 @@ static bool InlineCallIfPossible( /// Return true if inlining of CS can block the caller 
from being /// inlined which is proved to be more beneficial. \p IC is the /// estimated inline cost associated with callsite \p CS. -/// \p TotalAltCost will be set to the estimated cost of inlining the caller -/// if \p CS is suppressed for inlining. +/// \p TotalSecondaryCost will be set to the estimated cost of inlining the +/// caller if \p CS is suppressed for inlining. static bool shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, int &TotalSecondaryCost, @@ -288,7 +289,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, // treating them as truly abstract units etc. TotalSecondaryCost = 0; // The candidate cost to be imposed upon the current function. - int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1); + int CandidateCost = IC.getCost() - 1; // This bool tracks what happens if we do NOT inline C into B. bool callerWillBeRemoved = Caller->hasLocalLinkage(); // This bool tracks what happens if we DO inline C into B. @@ -325,7 +326,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC, // one is set very low by getInlineCost, in anticipation that Caller will // be removed entirely. We did not account for this above unless there // is only one caller of Caller. 
- if (callerWillBeRemoved && !Caller->use_empty()) + if (callerWillBeRemoved && !Caller->hasOneUse()) TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus; if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost()) @@ -342,6 +343,7 @@ static bool shouldInline(CallSite CS, InlineCost IC = GetInlineCost(CS); Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); if (IC.isAlways()) { DEBUG(dbgs() << " Inlining: cost=always" @@ -355,19 +357,20 @@ static bool shouldInline(CallSite CS, if (IC.isNever()) { DEBUG(dbgs() << " NOT Inlining: cost=never" << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NeverInline", Call) - << NV("Callee", Callee) - << " should never be inlined (cost=never)"); + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) + << NV("Callee", Callee) << " not inlined into " + << NV("Caller", Caller) + << " because it should never be inlined (cost=never)"); return false; } - Function *Caller = CS.getCaller(); if (!IC) { DEBUG(dbgs() << " NOT Inlining: cost=" << IC.getCost() << ", thres=" << (IC.getCostDelta() + IC.getCost()) << ", Call: " << *CS.getInstruction() << "\n"); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) - << NV("Callee", Callee) << " too costly to inline (cost=" + ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", Callee) << " not inlined into " + << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; @@ -378,8 +381,8 @@ static bool shouldInline(CallSite CS, DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction() << " Cost = " << IC.getCost() << ", outer Cost = " << TotalSecondaryCost << '\n'); - ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, - "IncreaseCostInOtherContexts", Call) + 
ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts", + Call) << "Not inlining. Cost of inlining " << NV("Callee", Callee) << " increases the cost of inlining " << NV("Caller", Caller) << " in other contexts"); @@ -552,16 +555,11 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, // If the policy determines that we should inline this function, // try to do so. - using namespace ore; - if (!shouldInline(CS, GetInlineCost, ORE)) { - ORE.emit( - OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block) - << NV("Callee", Callee) << " will not be inlined into " - << NV("Caller", Caller)); + if (!shouldInline(CS, GetInlineCost, ORE)) continue; - } // Attempt to inline the function. + using namespace ore; if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID, InsertLifetime, AARGetter, ImportedFunctionsStats)) { @@ -638,22 +636,12 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { ACT = &getAnalysis<AssumptionCacheTracker>(); PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - // We compute dedicated AA results for each function in the SCC as needed. We - // use a lambda referencing external objects so that they live long enough to - // be queried, but we re-use them each time. 
- Optional<BasicAAResult> BAR; - Optional<AAResults> AAR; - auto AARGetter = [&](Function &F) -> AAResults & { - BAR.emplace(createLegacyPMBasicAAResult(*this, F)); - AAR.emplace(createLegacyPMAAResults(*this, F, *BAR)); - return *AAR; - }; auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, [this](CallSite CS) { return getInlineCost(CS); }, - AARGetter, ImportedFunctionsStats); + LegacyAARGetter(*this), ImportedFunctionsStats); } /// Remove now-dead linkonce functions at the end of @@ -750,9 +738,6 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG, PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR) { - FunctionAnalysisManager &FAM = - AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG) - .getManager(); const ModuleAnalysisManager &MAM = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager(); bool Changed = false; @@ -761,35 +746,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, Module &M = *InitialC.begin()->getFunction().getParent(); ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M); - std::function<AssumptionCache &(Function &)> GetAssumptionCache = - [&](Function &F) -> AssumptionCache & { - return FAM.getResult<AssumptionAnalysis>(F); - }; - - // Setup the data structure used to plumb customization into the - // `InlineFunction` routine. - InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache); + // We use a single common worklist for calls across the entire SCC. We + // process these in-order and append new calls introduced during inlining to + // the end. + // + // Note that this particular order of processing is actually critical to + // avoid very bad behaviors. 
Consider *highly connected* call graphs where + // each function contains a small amonut of code and a couple of calls to + // other functions. Because the LLVM inliner is fundamentally a bottom-up + // inliner, it can handle gracefully the fact that these all appear to be + // reasonable inlining candidates as it will flatten things until they become + // too big to inline, and then move on and flatten another batch. + // + // However, when processing call edges *within* an SCC we cannot rely on this + // bottom-up behavior. As a consequence, with heavily connected *SCCs* of + // functions we can end up incrementally inlining N calls into each of + // N functions because each incremental inlining decision looks good and we + // don't have a topological ordering to prevent explosions. + // + // To compensate for this, we don't process transitive edges made immediate + // by inlining until we've done one pass of inlining across the entire SCC. + // Large, highly connected SCCs still lead to some amount of code bloat in + // this model, but it is uniformly spread across all the functions in the SCC + // and eventually they all become too large to inline, rather than + // incrementally maknig a single function grow in a super linear fashion. + SmallVector<std::pair<CallSite, int>, 16> Calls; - auto GetInlineCost = [&](CallSite CS) { - Function &Callee = *CS.getCalledFunction(); - auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); - return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, PSI); - }; + // Populate the initial list of calls in this SCC. + for (auto &N : InitialC) { + // We want to generally process call sites top-down in order for + // simplifications stemming from replacing the call with the returned value + // after inlining to be visible to subsequent inlining decisions. + // FIXME: Using instructions sequence is a really bad way to do this. + // Instead we should do an actual RPO walk of the function body. 
+ for (Instruction &I : instructions(N.getFunction())) + if (auto CS = CallSite(&I)) + if (Function *Callee = CS.getCalledFunction()) + if (!Callee->isDeclaration()) + Calls.push_back({CS, -1}); + } + if (Calls.empty()) + return PreservedAnalyses::all(); - // We use a worklist of nodes to process so that we can handle if the SCC - // structure changes and some nodes are no longer part of the current SCC. We - // also need to use an updatable pointer for the SCC as a consequence. - SmallVector<LazyCallGraph::Node *, 16> Nodes; - for (auto &N : InitialC) - Nodes.push_back(&N); + // Capture updatable variables for the current SCC and RefSCC. auto *C = &InitialC; auto *RC = &C->getOuterRefSCC(); - // We also use a secondary worklist of call sites within a particular node to - // allow quickly continuing to inline through newly inlined call sites where - // possible. - SmallVector<std::pair<CallSite, int>, 16> Calls; - // When inlining a callee produces new call sites, we want to keep track of // the fact that they were inlined from the callee. This allows us to avoid // infinite inlining in some obscure cases. To represent this, we use an @@ -805,34 +807,58 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // defer deleting these to make it easier to handle the call graph updates. SmallVector<Function *, 4> DeadFunctions; - do { - auto &N = *Nodes.pop_back_val(); + // Loop forward over all of the calls. Note that we cannot cache the size as + // inlining can introduce new calls that need to be processed. + for (int i = 0; i < (int)Calls.size(); ++i) { + // We expect the calls to typically be batched with sequences of calls that + // have the same caller, so we first set up some shared infrastructure for + // this caller. We also do any pruning we can at this layer on the caller + // alone. 
+ Function &F = *Calls[i].first.getCaller(); + LazyCallGraph::Node &N = *CG.lookup(F); if (CG.lookupSCC(N) != C) continue; - Function &F = N.getFunction(); if (F.hasFnAttribute(Attribute::OptimizeNone)) continue; + DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n"); + + // Get a FunctionAnalysisManager via a proxy for this particular node. We + // do this each time we visit a node as the SCC may have changed and as + // we're going to mutate this particular function we want to make sure the + // proxy is in place to forward any invalidation events. We can use the + // manager we get here for looking up results for functions other than this + // node however because those functions aren't going to be mutated by this + // pass. + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG) + .getManager(); + std::function<AssumptionCache &(Function &)> GetAssumptionCache = + [&](Function &F) -> AssumptionCache & { + return FAM.getResult<AssumptionAnalysis>(F); + }; + auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult<BlockFrequencyAnalysis>(F); + }; + + auto GetInlineCost = [&](CallSite CS) { + Function &Callee = *CS.getCalledFunction(); + auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee); + return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI}, + PSI); + }; + // Get the remarks emission analysis for the caller. auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); - // We want to generally process call sites top-down in order for - // simplifications stemming from replacing the call with the returned value - // after inlining to be visible to subsequent inlining decisions. So we - // walk the function backwards and then process the back of the vector. - // FIXME: Using reverse is a really bad way to do this. Instead we should - // do an actual PO walk of the function body. 
- for (Instruction &I : reverse(instructions(F))) - if (auto CS = CallSite(&I)) - if (Function *Callee = CS.getCalledFunction()) - if (!Callee->isDeclaration()) - Calls.push_back({CS, -1}); - + // Now process as many calls as we have within this caller in the sequnece. + // We bail out as soon as the caller has to change so we can update the + // call graph and prepare the context of that new caller. bool DidInline = false; - while (!Calls.empty()) { + for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) { int InlineHistoryID; CallSite CS; - std::tie(CS, InlineHistoryID) = Calls.pop_back_val(); + std::tie(CS, InlineHistoryID) = Calls[i]; Function &Callee = *CS.getCalledFunction(); if (InlineHistoryID != -1 && @@ -843,6 +869,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!shouldInline(CS, GetInlineCost, ORE)) continue; + // Setup the data structure used to plumb customization into the + // `InlineFunction` routine. + InlineFunctionInfo IFI( + /*cg=*/nullptr, &GetAssumptionCache, + &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())), + &FAM.getResult<BlockFrequencyAnalysis>(Callee)); + if (!InlineFunction(CS, IFI)) continue; DidInline = true; @@ -870,6 +903,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // made dead by this operation on other functions). Callee.removeDeadConstantUsers(); if (Callee.use_empty()) { + Calls.erase( + std::remove_if(Calls.begin() + i + 1, Calls.end(), + [&Callee](const std::pair<CallSite, int> &Call) { + return Call.first.getCaller() == &Callee; + }), + Calls.end()); // Clear the body and queue the function itself for deletion when we // finish inlining and call graph updates. // Note that after this point, it is an error to do anything other @@ -882,6 +921,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, } } + // Back the call index up by one to put us in a good position to go around + // the outer loop. 
+ --i; + if (!DidInline) continue; Changed = true; @@ -896,8 +939,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // below. for (Function *InlinedCallee : InlinedCallees) { LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee); - for (LazyCallGraph::Edge &E : CalleeN) - RC->insertTrivialRefEdge(N, *E.getNode()); + for (LazyCallGraph::Edge &E : *CalleeN) + RC->insertTrivialRefEdge(N, E.getNode()); } InlinedCallees.clear(); @@ -908,8 +951,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // re-use the exact same logic for updating the call graph to reflect the // change.. C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR); + DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n"); RC = &C->getOuterRefSCC(); - } while (!Nodes.empty()); + } // Now that we've finished inlining all of the calls across this SCC, delete // all of the trivially dead functions, updating the call graph and the CGSCC @@ -920,8 +964,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, // sets. for (Function *DeadF : DeadFunctions) { // Get the necessary information out of the call graph and nuke the - // function there. + // function there. Also, cclear out any cached analyses. 
auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF)); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG) + .getManager(); + FAM.clear(*DeadF); + AM.clear(DeadC); auto &DeadRC = DeadC.getOuterRefSCC(); CG.removeDeadFunction(*DeadF); diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp index deb7e819480b..785207efbe5c 100644 --- a/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/lib/Transforms/IPO/LowerTypeTests.cpp @@ -42,8 +42,6 @@ using namespace llvm; using namespace lowertypetests; -using SummaryAction = LowerTypeTestsSummaryAction; - #define DEBUG_TYPE "lowertypetests" STATISTIC(ByteArraySizeBits, "Byte array size in bits"); @@ -57,13 +55,13 @@ static cl::opt<bool> AvoidReuse( cl::desc("Try to avoid reuse of byte array addresses using aliases"), cl::Hidden, cl::init(true)); -static cl::opt<SummaryAction> ClSummaryAction( +static cl::opt<PassSummaryAction> ClSummaryAction( "lowertypetests-summary-action", cl::desc("What to do with the summary when running this pass"), - cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"), - clEnumValN(SummaryAction::Import, "import", + cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"), + clEnumValN(PassSummaryAction::Import, "import", "Import typeid resolutions from summary and globals"), - clEnumValN(SummaryAction::Export, "export", + clEnumValN(PassSummaryAction::Export, "export", "Export typeid resolutions to summary and globals")), cl::Hidden); @@ -234,8 +232,8 @@ public: class LowerTypeTestsModule { Module &M; - SummaryAction Action; - ModuleSummaryIndex *Summary; + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; bool LinkerSubsectionsViaSymbols; Triple::ArchType Arch; @@ -253,15 +251,21 @@ class LowerTypeTestsModule { // Indirect function call index assignment counter for WebAssembly uint64_t IndirectIndex = 1; - // Mapping from type identifiers to the call sites that test them. 
- DenseMap<Metadata *, std::vector<CallInst *>> TypeTestCallSites; + // Mapping from type identifiers to the call sites that test them, as well as + // whether the type identifier needs to be exported to ThinLTO backends as + // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId). + struct TypeIdUserInfo { + std::vector<CallInst *> CallSites; + bool IsExported = false; + }; + DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers; /// This structure describes how to lower type tests for a particular type /// identifier. It is either built directly from the global analysis (during /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type /// identifier summaries and external symbol references (in ThinLTO backends). struct TypeIdLowering { - TypeTestResolution::Kind TheKind; + TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat; /// All except Unsat: the start address within the combined global. Constant *OffsetedGlobal; @@ -274,9 +278,6 @@ class LowerTypeTestsModule { /// covering members of this type identifier as a multiple of 2^AlignLog2. Constant *SizeM1; - /// ByteArray, Inline, AllOnes: range of SizeM1 expressed as a bit width. - unsigned SizeM1BitWidth; - /// ByteArray: the byte array to test the address against. 
Constant *TheByteArray; @@ -291,6 +292,10 @@ class LowerTypeTestsModule { Function *WeakInitializerFn = nullptr; + void exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); + TypeIdLowering importTypeId(StringRef TypeId); + void importTypeTest(CallInst *CI); + BitSetInfo buildBitSet(Metadata *TypeId, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout); @@ -327,8 +332,8 @@ class LowerTypeTestsModule { void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions); public: - LowerTypeTestsModule(Module &M, SummaryAction Action, - ModuleSummaryIndex *Summary); + LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary); bool lower(); // Lower the module using the action and summary passed as command line @@ -341,15 +346,17 @@ struct LowerTypeTests : public ModulePass { bool UseCommandLine = false; - SummaryAction Action; - ModuleSummaryIndex *Summary; + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; LowerTypeTests() : ModulePass(ID), UseCommandLine(true) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } - LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary) - : ModulePass(ID), Action(Action), Summary(Summary) { + LowerTypeTests(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : ModulePass(ID), ExportSummary(ExportSummary), + ImportSummary(ImportSummary) { initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry()); } @@ -358,7 +365,7 @@ struct LowerTypeTests : public ModulePass { return false; if (UseCommandLine) return LowerTypeTestsModule::runForTesting(M); - return LowerTypeTestsModule(M, Action, Summary).lower(); + return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower(); } }; @@ -368,9 +375,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false, false) char LowerTypeTests::ID = 0; -ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action, - 
ModuleSummaryIndex *Summary) { - return new LowerTypeTests(Action, Summary); +ModulePass * +llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) { + return new LowerTypeTests(ExportSummary, ImportSummary); } /// Build a bit set for TypeId using the object layouts in @@ -494,10 +502,11 @@ Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B, return createMaskedBitTest(B, TIL.InlineBits, BitOffset); } else { Constant *ByteArray = TIL.TheByteArray; - if (!LinkerSubsectionsViaSymbols && AvoidReuse) { + if (!LinkerSubsectionsViaSymbols && AvoidReuse && !ImportSummary) { // Each use of the byte array uses a different alias. This makes the // backend less likely to reuse previously computed byte array addresses, // improving the security of the CFI mechanism based on this pass. + // This won't work when importing because TheByteArray is external. ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage, "bits_use", ByteArray, &M); } @@ -593,8 +602,7 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI, IntPtrTy)); Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL); - Constant *BitSizeConst = ConstantExpr::getZExt(TIL.SizeM1, IntPtrTy); - Value *OffsetInRange = B.CreateICmpULE(BitOffset, BitSizeConst); + Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1); // If the bit set is all ones, testing against it is unnecessary. if (TIL.TheKind == TypeTestResolution::AllOnes) @@ -687,6 +695,123 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( } } +/// Export the given type identifier so that ThinLTO backends may import it. +/// Type identifiers are exported by adding coarse-grained information about how +/// to test the type identifier to the summary, and creating symbols in the +/// object file (aliases and absolute symbols) containing fine-grained +/// information about the type identifier. 
+void LowerTypeTestsModule::exportTypeId(StringRef TypeId, + const TypeIdLowering &TIL) { + TypeTestResolution &TTRes = + ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes; + TTRes.TheKind = TIL.TheKind; + + auto ExportGlobal = [&](StringRef Name, Constant *C) { + GlobalAlias *GA = + GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage, + "__typeid_" + TypeId + "_" + Name, C, &M); + GA->setVisibility(GlobalValue::HiddenVisibility); + }; + + if (TIL.TheKind != TypeTestResolution::Unsat) + ExportGlobal("global_addr", TIL.OffsetedGlobal); + + if (TIL.TheKind == TypeTestResolution::ByteArray || + TIL.TheKind == TypeTestResolution::Inline || + TIL.TheKind == TypeTestResolution::AllOnes) { + ExportGlobal("align", ConstantExpr::getIntToPtr(TIL.AlignLog2, Int8PtrTy)); + ExportGlobal("size_m1", ConstantExpr::getIntToPtr(TIL.SizeM1, Int8PtrTy)); + + uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1; + if (TIL.TheKind == TypeTestResolution::Inline) + TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6; + else + TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32; + } + + if (TIL.TheKind == TypeTestResolution::ByteArray) { + ExportGlobal("byte_array", TIL.TheByteArray); + ExportGlobal("bit_mask", TIL.BitMask); + } + + if (TIL.TheKind == TypeTestResolution::Inline) + ExportGlobal("inline_bits", + ConstantExpr::getIntToPtr(TIL.InlineBits, Int8PtrTy)); +} + +LowerTypeTestsModule::TypeIdLowering +LowerTypeTestsModule::importTypeId(StringRef TypeId) { + const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId); + if (!TidSummary) + return {}; // Unsat: no globals match this type id. 
+ const TypeTestResolution &TTRes = TidSummary->TTRes; + + TypeIdLowering TIL; + TIL.TheKind = TTRes.TheKind; + + auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) { + Constant *C = + M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty); + auto *GV = dyn_cast<GlobalVariable>(C); + // We only need to set metadata if the global is newly created, in which + // case it would not have hidden visibility. + if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + return C; + + GV->setVisibility(GlobalValue::HiddenVisibility); + auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(M.getContext(), {MinC, MaxC})); + }; + if (AbsWidth == IntPtrTy->getBitWidth()) + SetAbsRange(~0ull, ~0ull); // Full set. + else if (AbsWidth) + SetAbsRange(0, 1ull << AbsWidth); + return C; + }; + + if (TIL.TheKind != TypeTestResolution::Unsat) + TIL.OffsetedGlobal = ImportGlobal("global_addr", 0); + + if (TIL.TheKind == TypeTestResolution::ByteArray || + TIL.TheKind == TypeTestResolution::Inline || + TIL.TheKind == TypeTestResolution::AllOnes) { + TIL.AlignLog2 = ConstantExpr::getPtrToInt(ImportGlobal("align", 8), Int8Ty); + TIL.SizeM1 = ConstantExpr::getPtrToInt( + ImportGlobal("size_m1", TTRes.SizeM1BitWidth), IntPtrTy); + } + + if (TIL.TheKind == TypeTestResolution::ByteArray) { + TIL.TheByteArray = ImportGlobal("byte_array", 0); + TIL.BitMask = ImportGlobal("bit_mask", 8); + } + + if (TIL.TheKind == TypeTestResolution::Inline) + TIL.InlineBits = ConstantExpr::getPtrToInt( + ImportGlobal("inline_bits", 1 << TTRes.SizeM1BitWidth), + TTRes.SizeM1BitWidth <= 5 ? 
Int32Ty : Int64Ty); + + return TIL; +} + +void LowerTypeTestsModule::importTypeTest(CallInst *CI) { + auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!TypeIdMDVal) + report_fatal_error("Second argument of llvm.type.test must be metadata"); + + auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata()); + if (!TypeIdStr) + report_fatal_error( + "Second argument of llvm.type.test must be a metadata string"); + + TypeIdLowering TIL = importTypeId(TypeIdStr->getString()); + Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL); + CI->replaceAllUsesWith(Lowered); + CI->eraseFromParent(); +} + void LowerTypeTestsModule::lowerTypeTestCalls( ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr, const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) { @@ -708,16 +833,12 @@ void LowerTypeTestsModule::lowerTypeTestCalls( TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr( Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)), TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2); + TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1); if (BSI.isAllOnes()) { TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single : TypeTestResolution::AllOnes; - TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; - TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? Int8Ty : Int32Ty, - BSI.BitSize - 1); } else if (BSI.BitSize <= 64) { TIL.TheKind = TypeTestResolution::Inline; - TIL.SizeM1BitWidth = (BSI.BitSize <= 32) ? 5 : 6; - TIL.SizeM1 = ConstantInt::get(Int8Ty, BSI.BitSize - 1); uint64_t InlineBits = 0; for (auto Bit : BSI.Bits) InlineBits |= uint64_t(1) << Bit; @@ -728,17 +849,19 @@ void LowerTypeTestsModule::lowerTypeTestCalls( (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits); } else { TIL.TheKind = TypeTestResolution::ByteArray; - TIL.SizeM1BitWidth = (BSI.BitSize <= 128) ? 7 : 32; - TIL.SizeM1 = ConstantInt::get((BSI.BitSize <= 128) ? 
Int8Ty : Int32Ty, - BSI.BitSize - 1); ++NumByteArraysCreated; ByteArrayInfo *BAI = createByteArray(BSI); TIL.TheByteArray = BAI->ByteArray; TIL.BitMask = BAI->MaskGlobal; } + TypeIdUserInfo &TIUI = TypeIdUsers[TypeId]; + + if (TIUI.IsExported) + exportTypeId(cast<MDString>(TypeId)->getString(), TIL); + // Lower each call to llvm.type.test for this type identifier. - for (CallInst *CI : TypeTestCallSites[TypeId]) { + for (CallInst *CI : TIUI.CallSites) { ++NumTypeTestCallsLowered; Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL); CI->replaceAllUsesWith(Lowered); @@ -757,9 +880,9 @@ void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) { report_fatal_error( "A member of a type identifier may not have an explicit section"); - if (isa<GlobalVariable>(GO) && GO->isDeclarationForLinker()) - report_fatal_error( - "A global var member of a type identifier must be a definition"); + // FIXME: We previously checked that global var member of a type identifier + // must be a definition, but the IR linker may leave type metadata on + // declarations. We should restore this check after fixing PR31759. auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0)); if (!OffsetConstMD) @@ -1173,13 +1296,11 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet( } /// Lower all type tests in this module. -LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action, - ModuleSummaryIndex *Summary) - : M(M), Action(Action), Summary(Summary) { - // FIXME: Use these fields. 
- (void)this->Action; - (void)this->Summary; - +LowerTypeTestsModule::LowerTypeTestsModule( + Module &M, ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) { + assert(!(ExportSummary && ImportSummary)); Triple TargetTriple(M.getTargetTriple()); LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX(); Arch = TargetTriple.getArch(); @@ -1203,7 +1324,11 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { ExitOnErr(errorCodeToError(In.error())); } - bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower(); + bool Changed = + LowerTypeTestsModule( + M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, + ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr) + .lower(); if (!ClWriteSummary.empty()) { ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + @@ -1222,9 +1347,18 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); - if (!TypeTestFunc || TypeTestFunc->use_empty()) + if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary) return false; + if (ImportSummary) { + for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end(); + UI != UE;) { + auto *CI = cast<CallInst>((*UI++).getUser()); + importTypeTest(CI); + } + return true; + } + // Equivalence class set containing type identifiers and the globals that // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. 
@@ -1248,6 +1382,9 @@ bool LowerTypeTestsModule::lower() { unsigned I = 0; SmallVector<MDNode *, 2> Types; for (GlobalObject &GO : M.global_objects()) { + if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker()) + continue; + Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); if (Types.empty()) @@ -1262,33 +1399,57 @@ bool LowerTypeTestsModule::lower() { } } - for (const Use &U : TypeTestFunc->uses()) { - auto CI = cast<CallInst>(U.getUser()); + auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & { + // Add the call site to the list of call sites for this type identifier. We + // also use TypeIdUsers to keep track of whether we have seen this type + // identifier before. If we have, we don't need to re-add the referenced + // globals to the equivalence class. + auto Ins = TypeIdUsers.insert({TypeId, {}}); + if (Ins.second) { + // Add the type identifier to the equivalence class. + GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId); + GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + + // Add the referenced globals to the type identifier's equivalence class. + for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals) + CurSet = GlobalClasses.unionSets( + CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM))); + } + + return Ins.first->second; + }; - auto BitSetMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); - if (!BitSetMDVal) - report_fatal_error("Second argument of llvm.type.test must be metadata"); - auto BitSet = BitSetMDVal->getMetadata(); + if (TypeTestFunc) { + for (const Use &U : TypeTestFunc->uses()) { + auto CI = cast<CallInst>(U.getUser()); - // Add the call site to the list of call sites for this type identifier. We - // also use TypeTestCallSites to keep track of whether we have seen this - // type identifier before. If we have, we don't need to re-add the - // referenced globals to the equivalence class. 
- std::pair<DenseMap<Metadata *, std::vector<CallInst *>>::iterator, bool> - Ins = TypeTestCallSites.insert( - std::make_pair(BitSet, std::vector<CallInst *>())); - Ins.first->second.push_back(CI); - if (!Ins.second) - continue; + auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1)); + if (!TypeIdMDVal) + report_fatal_error("Second argument of llvm.type.test must be metadata"); + auto TypeId = TypeIdMDVal->getMetadata(); + AddTypeIdUse(TypeId).CallSites.push_back(CI); + } + } - // Add the type identifier to the equivalence class. - GlobalClassesTy::iterator GCI = GlobalClasses.insert(BitSet); - GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); + if (ExportSummary) { + DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID; + for (auto &P : TypeIdInfo) { + if (auto *TypeId = dyn_cast<MDString>(P.first)) + MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back( + TypeId); + } - // Add the referenced globals to the type identifier's equivalence class. - for (GlobalTypeMember *GTM : TypeIdInfo[BitSet].RefGlobals) - CurSet = GlobalClasses.unionSets( - CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM))); + for (auto &P : *ExportSummary) { + for (auto &S : P.second) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. 
+ for (GlobalValue::GUID G : FS->type_tests()) + for (Metadata *MD : MetadataByGUID[G]) + AddTypeIdUse(MD).IsExported = true; + } + } } if (GlobalClasses.empty()) @@ -1349,8 +1510,9 @@ bool LowerTypeTestsModule::lower() { PreservedAnalyses LowerTypeTestsPass::run(Module &M, ModuleAnalysisManager &AM) { - bool Changed = - LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower(); + bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr, + /*ImportSummary=*/nullptr) + .lower(); if (!Changed) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index e0bb0eb42b59..771770ddc060 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -96,8 +96,10 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" @@ -127,6 +129,26 @@ static cl::opt<unsigned> NumFunctionsForSanityCheck( "'0' disables this check. Works only with '-debug' key."), cl::init(0), cl::Hidden); +// Under option -mergefunc-preserve-debug-info we: +// - Do not create a new function for a thunk. +// - Retain the debug info for a thunk's parameters (and associated +// instructions for the debug info) from the entry block. +// Note: -debug will display the algorithm at work. +// - Create debug-info for the call (to the shared implementation) made by +// a thunk and its return value. +// - Erase the rest of the function, retaining the (minimally sized) entry +// block to create a thunk. +// - Preserve a thunk's call site to point to the thunk even when both occur +// within the same translation unit, to aid debugability. 
Note that this +// behaviour differs from the underlying -mergefunc implementation which +// modifies the thunk's call site to point to the shared implementation +// when both occur within the same translation unit. +static cl::opt<bool> + MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden, + cl::init(false), + cl::desc("Preserve debug info in thunk when mergefunc " + "transformations are made.")); + namespace { class FunctionNode { @@ -215,8 +237,21 @@ private: /// Replace G with a thunk or an alias to F. Deletes G. void writeThunkOrAlias(Function *F, Function *G); - /// Replace G with a simple tail call to bitcast(F). Also replace direct uses - /// of G with bitcast(F). Deletes G. + /// Fill PDIUnrelatedWL with instructions from the entry block that are + /// unrelated to parameter related debug info. + void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock, + std::vector<Instruction *> &PDIUnrelatedWL); + + /// Erase the rest of the CFG (i.e. barring the entry block). + void eraseTail(Function *G); + + /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the + /// parameter debug info, from the entry block. + void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL); + + /// Replace G with a simple tail call to bitcast(F). Also (unless + /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F), + /// delete G. void writeThunk(Function *F, Function *G); /// Replace G with an alias to F. Deletes G. 
@@ -269,8 +304,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { if (Res1 != -Res2) { dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber << "\n"; - F1->dump(); - F2->dump(); + dbgs() << *F1 << '\n' << *F2 << '\n'; Valid = false; } @@ -305,9 +339,7 @@ bool MergeFunctions::doSanityCheck(std::vector<WeakVH> &Worklist) { << TripleNumber << "\n"; dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", " << Res4 << "\n"; - F1->dump(); - F2->dump(); - F3->dump(); + dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n'; Valid = false; } } @@ -400,19 +432,15 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { // Transferring other attributes may help other optimizations, but that // should be done uniformly and not in this ad-hoc way. auto &Context = New->getContext(); - auto NewFuncAttrs = New->getAttributes(); - auto CallSiteAttrs = CS.getAttributes(); - - CallSiteAttrs = CallSiteAttrs.addAttributes( - Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes()); - - for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) { - AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx); - if (Attrs.getNumSlots()) - CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs); - } - - CS.setAttributes(CallSiteAttrs); + auto NewPAL = New->getAttributes(); + SmallVector<AttributeSet, 4> NewArgAttrs; + for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) + NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx)); + // Don't transfer attributes from the function to the callee. Function + // attributes typically aren't relevant to the calling convention or ABI. 
+ CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), + NewPAL.getRetAttributes(), + NewArgAttrs)); remove(CS.getInstruction()->getParent()->getParent()); U->set(BitcastNew); @@ -461,51 +489,242 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { return Builder.CreateBitCast(V, DestTy); } -// Replace G with a simple tail call to bitcast(F). Also replace direct uses -// of G with bitcast(F). Deletes G. +// Erase the instructions in PDIUnrelatedWL as they are unrelated to the +// parameter debug info, from the entry block. +void MergeFunctions::eraseInstsUnrelatedToPDI( + std::vector<Instruction *> &PDIUnrelatedWL) { + + DEBUG(dbgs() << " Erasing instructions (in reverse order of appearance in " + "entry block) unrelated to parameter debug info from entry " + "block: {\n"); + while (!PDIUnrelatedWL.empty()) { + Instruction *I = PDIUnrelatedWL.back(); + DEBUG(dbgs() << " Deleting Instruction: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + I->eraseFromParent(); + PDIUnrelatedWL.pop_back(); + } + DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter " + "debug info from entry block. \n"); +} + +// Reduce G to its entry block. 
+void MergeFunctions::eraseTail(Function *G) { + + std::vector<BasicBlock *> WorklistBB; + for (Function::iterator BBI = std::next(G->begin()), BBE = G->end(); + BBI != BBE; ++BBI) { + BBI->dropAllReferences(); + WorklistBB.push_back(&*BBI); + } + while (!WorklistBB.empty()) { + BasicBlock *BB = WorklistBB.back(); + BB->eraseFromParent(); + WorklistBB.pop_back(); + } +} + +// We are interested in the following instructions from the entry block as being +// related to parameter debug info: +// - @llvm.dbg.declare +// - stores from the incoming parameters to locations on the stack-frame +// - allocas that create these locations on the stack-frame +// - @llvm.dbg.value +// - the entry block's terminator +// The rest are unrelated to debug info for the parameters; fill up +// PDIUnrelatedWL with such instructions. +void MergeFunctions::filterInstsUnrelatedToPDI( + BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) { + + std::set<Instruction *> PDIRelated; + for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end(); + BI != BIE; ++BI) { + if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) { + DEBUG(dbgs() << " Deciding: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + DILocalVariable *DILocVar = DVI->getVariable(); + if (DILocVar->isParameter()) { + DEBUG(dbgs() << " Include (parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) { + DEBUG(dbgs() << " Deciding: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + DILocalVariable *DILocVar = DDI->getVariable(); + if (DILocVar->isParameter()) { + DEBUG(dbgs() << " Parameter: "); + DEBUG(DILocVar->print(dbgs())); + AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress()); + if (AI) { + DEBUG(dbgs() << " Processing alloca users: "); + DEBUG(dbgs() << "\n"); + 
for (User *U : AI->users()) { + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (Value *Arg = SI->getValueOperand()) { + if (dyn_cast<Argument>(Arg)) { + DEBUG(dbgs() << " Include: "); + DEBUG(AI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(AI); + DEBUG(dbgs() << " Include (parameter): "); + DEBUG(SI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(SI); + DEBUG(dbgs() << " Include: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(SI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + } else { + DEBUG(dbgs() << " Defer: "); + DEBUG(U->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + } else { + DEBUG(dbgs() << " Delete (alloca NULL): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else { + DEBUG(dbgs() << " Delete (!parameter): "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } else if (dyn_cast<TerminatorInst>(BI) == GEntryBlock->getTerminator()) { + DEBUG(dbgs() << " Will Include Terminator: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIRelated.insert(&*BI); + } else { + DEBUG(dbgs() << " Defer: "); + DEBUG(BI->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + DEBUG(dbgs() + << " Report parameter debug info related/related instructions: {\n"); + for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end(); + BI != BE; ++BI) { + + Instruction *I = &*BI; + if (PDIRelated.find(I) == PDIRelated.end()) { + DEBUG(dbgs() << " !PDIRelated: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + PDIUnrelatedWL.push_back(I); + } else { + DEBUG(dbgs() << " PDIRelated: "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << "\n"); + } + } + DEBUG(dbgs() << " }\n"); +} + +// Replace G with a simple tail call to bitcast(F). Also (unless +// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F), +// delete G. 
Under MergeFunctionsPDI, we use G itself for creating +// the thunk as we preserve the debug info (and associated instructions) +// from G's entry block pertaining to G's incoming arguments which are +// passed on as corresponding arguments in the call that G makes to F. +// For better debugability, under MergeFunctionsPDI, we do not modify G's +// call sites to point to F even when within the same translation unit. void MergeFunctions::writeThunk(Function *F, Function *G) { - if (!G->isInterposable()) { - // Redirect direct callers of G to F. + if (!G->isInterposable() && !MergeFunctionsPDI) { + // Redirect direct callers of G to F. (See note on MergeFunctionsPDI + // above). replaceDirectCallers(G, F); } // If G was internal then we may have replaced all uses of G with F. If so, - // stop here and delete G. There's no need for a thunk. - if (G->hasLocalLinkage() && G->use_empty()) { + // stop here and delete G. There's no need for a thunk. (See note on + // MergeFunctionsPDI above). + if (G->hasLocalLinkage() && G->use_empty() && !MergeFunctionsPDI) { G->eraseFromParent(); return; } - Function *NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", - G->getParent()); - BasicBlock *BB = BasicBlock::Create(F->getContext(), "", NewG); - IRBuilder<> Builder(BB); + BasicBlock *GEntryBlock = nullptr; + std::vector<Instruction *> PDIUnrelatedWL; + BasicBlock *BB = nullptr; + Function *NewG = nullptr; + if (MergeFunctionsPDI) { + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new " + "function as thunk; retain original: " + << G->getName() << "()\n"); + GEntryBlock = &G->getEntryBlock(); + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related " + "debug info for " + << G->getName() << "() {\n"); + filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL); + GEntryBlock->getTerminator()->eraseFromParent(); + BB = GEntryBlock; + } else { + NewG = Function::Create(G->getFunctionType(), G->getLinkage(), "", + G->getParent()); + 
BB = BasicBlock::Create(F->getContext(), "", NewG); + } + IRBuilder<> Builder(BB); + Function *H = MergeFunctionsPDI ? G : NewG; SmallVector<Value *, 16> Args; unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); - for (Argument & AI : NewG->args()) { + for (Argument & AI : H->args()) { Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); ++i; } CallInst *CI = Builder.CreateCall(F, Args); + ReturnInst *RI = nullptr; CI->setTailCall(); CI->setCallingConv(F->getCallingConv()); CI->setAttributes(F->getAttributes()); - if (NewG->getReturnType()->isVoidTy()) { - Builder.CreateRetVoid(); + if (H->getReturnType()->isVoidTy()) { + RI = Builder.CreateRetVoid(); } else { - Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType())); + RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType())); } - NewG->copyAttributesFrom(G); - NewG->takeName(G); - removeUsers(G); - G->replaceAllUsesWith(NewG); - G->eraseFromParent(); + if (MergeFunctionsPDI) { + DISubprogram *DIS = G->getSubprogram(); + if (DIS) { + DebugLoc CIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS); + DebugLoc RIDbgLoc = DebugLoc::get(DIS->getScopeLine(), 0, DIS); + CI->setDebugLoc(CIDbgLoc); + RI->setDebugLoc(RIDbgLoc); + } else { + DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for " + << G->getName() << "()\n"); + } + eraseTail(G); + eraseInstsUnrelatedToPDI(PDIUnrelatedWL); + DEBUG(dbgs() << "} // End of parameter related debug info filtering for: " + << G->getName() << "()\n"); + } else { + NewG->copyAttributesFrom(G); + NewG->takeName(G); + removeUsers(G); + G->replaceAllUsesWith(NewG); + G->eraseFromParent(); + } - DEBUG(dbgs() << "writeThunk: " << NewG->getName() << '\n'); + DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n'); ++NumThunksWritten; } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 7ef3fc1fc2a7..a2f6e5639d9d 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ 
b/lib/Transforms/IPO/PartialInlining.cpp @@ -33,7 +33,7 @@ STATISTIC(NumPartialInlined, "Number of functions partially inlined"); namespace { struct PartialInlinerImpl { - PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(IFI) {} + PartialInlinerImpl(InlineFunctionInfo IFI) : IFI(std::move(IFI)) {} bool run(Module &M); Function *unswitchFunction(Function *F); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 941efb210d1c..f11b58d1adc4 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -93,10 +93,6 @@ static cl::opt<CFLAAType> clEnumValN(CFLAAType::Both, "both", "Enable both variants of CFL-AA"))); -static cl::opt<bool> -EnableMLSM("mlsm", cl::init(true), cl::Hidden, - cl::desc("Enable motion of merged load and store")); - static cl::opt<bool> EnableLoopInterchange( "enable-loopinterchange", cl::init(false), cl::Hidden, cl::desc("Enable the new, experimental LoopInterchange Pass")); @@ -141,8 +137,8 @@ static cl::opt<int> PreInlineThreshold( "(default = 75)")); static cl::opt<bool> EnableGVNHoist( - "enable-gvn-hoist", cl::init(false), cl::Hidden, - cl::desc("Enable the GVN hoisting pass")); + "enable-gvn-hoist", cl::init(true), cl::Hidden, + cl::desc("Enable the GVN hoisting pass (default = on)")); static cl::opt<bool> DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false), @@ -172,6 +168,7 @@ PassManagerBuilder::PassManagerBuilder() { PGOInstrUse = RunPGOInstrUse; PrepareForThinLTO = EnablePrepareForThinLTO; PerformThinLTO = false; + DivergentTarget = false; } PassManagerBuilder::~PassManagerBuilder() { @@ -248,8 +245,6 @@ void PassManagerBuilder::populateFunctionPassManager( FPM.add(createCFGSimplificationPass()); FPM.add(createSROAPass()); FPM.add(createEarlyCSEPass()); - if(EnableGVNHoist) - FPM.add(createGVNHoistPass()); FPM.add(createLowerExpectIntrinsicPass()); } @@ -294,6 +289,8 @@ void 
PassManagerBuilder::addFunctionSimplificationPasses( // Break up aggregate allocas, using SSAUpdater. MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. @@ -305,29 +302,34 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); + // Optimize memory intrinsic calls based on the profiled size information. + if (SizeLevel == 0) + MPM.add(createPGOMemOPSizeOptLegacyPass()); + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); MPM.add(createLICMPass()); // Hoist loop invariants - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); addInstructionCombiningPass(MPM); MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. 
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM); MPM.add(createLoopDeletionPass()); // Delete dead loops + if (EnableLoopInterchange) { MPM.add(createLoopInterchangePass()); // Interchange loops MPM.add(createCFGSimplificationPass()); } if (!DisableUnrollLoops) - MPM.add(createSimpleLoopUnrollPass()); // Unroll small loops + MPM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops addExtensionsToPM(EP_LoopOptimizerEnd, MPM); if (OptLevel > 1) { - if (EnableMLSM) - MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds + MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds MPM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies } @@ -369,7 +371,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // BBVectorize may have significantly shortened a loop body; unroll again. if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); + MPM.add(createLoopUnrollPass(OptLevel)); } } @@ -434,7 +436,16 @@ void PassManagerBuilder::populateModulePassManager( // earlier in the pass pipeline, here before globalopt. Otherwise imported // available_externally functions look unreferenced and are removed. if (PerformThinLTO) - MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true)); + MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true, + !PGOSampleUse.empty())); + + // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops + // as it will change the CFG too much to make the 2nd profile annotation + // in backend more difficult. + bool PrepareForThinLTOUsingPGOSampleProfile = + PrepareForThinLTO && !PGOSampleUse.empty(); + if (PrepareForThinLTOUsingPGOSampleProfile) + DisableUnrollLoops = true; if (!DisableUnitAtATime) { // Infer attributes about declarations if possible. 
@@ -454,14 +465,18 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } - if (!PerformThinLTO) { + // For SamplePGO in ThinLTO compile phase, we do not want to do indirect + // call promotion as it will change the CFG too much to make the 2nd + // profile annotation in backend more difficult. + if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) { /// PGO instrumentation is added during the compile phase for ThinLTO, do /// not run it a second time addPGOInstrPasses(MPM); // Indirect call promotion that promotes intra-module targets only. // For ThinLTO this is done earlier due to interactions with globalopt // for imported functions. - MPM.add(createPGOIndirectCallPromotionLegacyPass()); + MPM.add( + createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty())); } if (EnableNonLTOGlobalsModRef) @@ -589,7 +604,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createCorrelatedValuePropagationPass()); addInstructionCombiningPass(MPM); MPM.add(createLICMPass()); - MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3)); + MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); addInstructionCombiningPass(MPM); } @@ -615,16 +630,16 @@ void PassManagerBuilder::populateModulePassManager( // BBVectorize may have significantly shortened a loop body; unroll again. if (!DisableUnrollLoops) - MPM.add(createLoopUnrollPass()); + MPM.add(createLoopUnrollPass(OptLevel)); } } addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createCFGSimplificationPass()); + MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables addInstructionCombiningPass(MPM); if (!DisableUnrollLoops) { - MPM.add(createLoopUnrollPass()); // Unroll small loops + MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops // LoopUnroll may generate some redundency to cleanup. 
addInstructionCombiningPass(MPM); @@ -684,7 +699,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // left by the earlier promotion pass that promotes intra-module targets. // This two-step promotion is to save the compile time. For LTO, it should // produce the same result as if we only do promotion here. - PM.add(createPGOIndirectCallPromotionLegacyPass(true)); + PM.add( + createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty())); // Propagate constants at call sites into the functions they call. This // opens opportunities for globalopt (and inlining) by substituting function @@ -703,7 +719,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createGlobalSplitPass()); // Apply whole-program devirtualization and virtual constant propagation. - PM.add(createWholeProgramDevirtPass()); + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); // That's all we need at opt level 1. if (OptLevel == 1) @@ -759,8 +775,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. PM.add(createLICMPass()); // Hoist loop invariants. - if (EnableMLSM) - PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. + PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds. PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. @@ -775,11 +790,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createLoopInterchangePass()); if (!DisableUnrollLoops) - PM.add(createSimpleLoopUnrollPass()); // Unroll small loops + PM.add(createSimpleLoopUnrollPass(OptLevel)); // Unroll small loops PM.add(createLoopVectorizePass(true, LoopVectorize)); // The vectorizer may have significantly shortened a loop body; unroll again. 
if (!DisableUnrollLoops) - PM.add(createLoopUnrollPass()); + PM.add(createLoopUnrollPass(OptLevel)); // Now that we've optimized loops (in particular loop induction variables), // we may have exposed more scalar opportunities. Run parts of the scalar @@ -833,6 +848,23 @@ void PassManagerBuilder::populateThinLTOPassManager( if (VerifyInput) PM.add(createVerifierPass()); + if (ImportSummary) { + // These passes import type identifier resolutions for whole-program + // devirtualization and CFI. They must run early because other passes may + // disturb the specific instruction patterns that these passes look for, + // creating dependencies on resolutions that may not appear in the summary. + // + // For example, GVN may transform the pattern assume(type.test) appearing in + // two basic blocks into assume(phi(type.test, type.test)), which would + // transform a dependency on a WPD resolution into a dependency on a type + // identifier resolution for CFI. + // + // Also, WPD has access to more precise information than ICP and can + // devirtualize more effectively, so it should operate on the IR first. + PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary)); + PM.add(createLowerTypeTestsPass(nullptr, ImportSummary)); + } + populateModulePassManager(PM); if (VerifyOutput) @@ -857,8 +889,7 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { // Lower type metadata and the type.test intrinsic. This pass supports Clang's // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at // link time if CFI is enabled. The pass does nothing if CFI is disabled. 
- PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None, - /*Summary=*/nullptr)); + PM.add(createLowerTypeTestsPass(ExportSummary, nullptr)); if (OptLevel != 0) addLateLTOOptimizationPasses(PM); diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp index 6a43f8dbac48..3371de6e3d14 100644 --- a/lib/Transforms/IPO/SampleProfile.cpp +++ b/lib/Transforms/IPO/SampleProfile.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -43,6 +44,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -50,6 +52,7 @@ #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/Cloning.h" #include <cctype> @@ -159,8 +162,11 @@ protected: ErrorOr<uint64_t> getInstWeight(const Instruction &I); ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB); const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const; + std::vector<const FunctionSamples *> + findIndirectCallFunctionSamples(const Instruction &I) const; const FunctionSamples *findFunctionSamples(const Instruction &I) const; - bool inlineHotFunctions(Function &F); + bool inlineHotFunctions(Function &F, + DenseSet<GlobalValue::GUID> &ImportGUIDs); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -173,7 +179,7 @@ protected: void buildEdges(Function &F); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void 
computeDominanceAndLoopInfo(Function &F); - unsigned getOffset(unsigned L, unsigned H) const; + unsigned getOffset(const DILocation *DIL) const; void clearFunctionData(); /// \brief Map basic blocks to their computed weights. @@ -326,11 +332,12 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { // If there are inlined callsites in this function, count the samples found // in the respective bodies. However, do not bother counting callees with 0 // total samples, these are callees that were never invoked at runtime. - for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countUsedRecords(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countUsedRecords(CalleeSamples); + } return Count; } @@ -343,11 +350,12 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { unsigned Count = FS->getBodySamples().size(); // Only count records in hot callsites. - for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countBodyRecords(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Count += countBodyRecords(CalleeSamples); + } return Count; } @@ -362,11 +370,12 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { Total += I.second.getSamples(); // Only count samples in hot callsites. 
- for (const auto &I : FS->getCallsiteSamples()) { - const FunctionSamples *CalleeSamples = &I.second; - if (callsiteIsHot(FS, CalleeSamples)) - Total += countBodySamples(CalleeSamples); - } + for (const auto &I : FS->getCallsiteSamples()) + for (const auto &J : I.second) { + const FunctionSamples *CalleeSamples = &J.second; + if (callsiteIsHot(FS, CalleeSamples)) + Total += countBodySamples(CalleeSamples); + } return Total; } @@ -398,15 +407,11 @@ void SampleProfileLoader::clearFunctionData() { CoverageTracker.clear(); } -/// \brief Returns the offset of lineno \p L to head_lineno \p H -/// -/// \param L Lineno -/// \param H Header lineno of the function -/// -/// \returns offset to the header lineno. 16 bits are used to represent offset. +/// Returns the line offset to the start line of the subprogram. /// We assume that a single function will not exceed 65535 LOC. -unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const { - return (L - H) & 0xffff; +unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const { + return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) & + 0xffff; } /// \brief Print the weight of edge \p E on stream \p OS. @@ -451,8 +456,7 @@ void SampleProfileLoader::printBlockWeight(raw_ostream &OS, /// \param Inst Instruction to query. /// /// \returns the weight of \p Inst. -ErrorOr<uint64_t> -SampleProfileLoader::getInstWeight(const Instruction &Inst) { +ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { const DebugLoc &DLoc = Inst.getDebugLoc(); if (!DLoc) return std::error_code(); @@ -470,19 +474,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { // If a call/invoke instruction is inlined in profile, but not inlined here, // it means that the inlined callsite has no sample, thus the call // instruction should have 0 count. 
- bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst); - if (IsCall && findCalleeFunctionSamples(Inst)) + if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) && + findCalleeFunctionSamples(Inst)) return 0; const DILocation *DIL = DLoc; - unsigned Lineno = DLoc.getLine(); - unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine(); - - uint32_t LineOffset = getOffset(Lineno, HeaderLineno); - uint32_t Discriminator = DIL->getDiscriminator(); - ErrorOr<uint64_t> R = IsCall - ? FS->findCallSamplesAt(LineOffset, Discriminator) - : FS->findSamplesAt(LineOffset, Discriminator); + uint32_t LineOffset = getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); + ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator); if (R) { bool FirstMark = CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get()); @@ -491,13 +490,14 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { LLVMContext &Ctx = F->getContext(); emitOptimizationRemark( Ctx, DEBUG_TYPE, *F, DLoc, - Twine("Applied ") + Twine(*R) + " samples from profile (offset: " + - Twine(LineOffset) + + Twine("Applied ") + Twine(*R) + + " samples from profile (offset: " + Twine(LineOffset) + ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")"); } - DEBUG(dbgs() << " " << Lineno << "." << DIL->getDiscriminator() << ":" - << Inst << " (line offset: " << Lineno - HeaderLineno << "." - << DIL->getDiscriminator() << " - weight: " << R.get() + DEBUG(dbgs() << " " << DLoc.getLine() << "." + << DIL->getBaseDiscriminator() << ":" << Inst + << " (line offset: " << LineOffset << "." + << DIL->getBaseDiscriminator() << " - weight: " << R.get() << ")\n"); } return R; @@ -511,8 +511,7 @@ SampleProfileLoader::getInstWeight(const Instruction &Inst) { /// \param BB The basic block to query. /// /// \returns the weight for \p BB. 
-ErrorOr<uint64_t> -SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { +ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) { uint64_t Max = 0; bool HasWeight = false; for (auto &I : BB->getInstList()) { @@ -565,16 +564,49 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { if (!DIL) { return nullptr; } - DISubprogram *SP = DIL->getScope()->getSubprogram(); - if (!SP) - return nullptr; + + StringRef CalleeName; + if (const CallInst *CI = dyn_cast<CallInst>(&Inst)) + if (Function *Callee = CI->getCalledFunction()) + CalleeName = Callee->getName(); const FunctionSamples *FS = findFunctionSamples(Inst); if (FS == nullptr) return nullptr; - return FS->findFunctionSamplesAt(LineLocation( - getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator())); + return FS->findFunctionSamplesAt( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), CalleeName); +} + +/// Returns a vector of FunctionSamples that are the indirect call targets +/// of \p Inst. The vector is sorted by the total number of samples. +std::vector<const FunctionSamples *> +SampleProfileLoader::findIndirectCallFunctionSamples( + const Instruction &Inst) const { + const DILocation *DIL = Inst.getDebugLoc(); + std::vector<const FunctionSamples *> R; + + if (!DIL) { + return R; + } + + const FunctionSamples *FS = findFunctionSamples(Inst); + if (FS == nullptr) + return R; + + if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()))) { + if (M->size() == 0) + return R; + for (const auto &NameFS : *M) { + R.push_back(&NameFS.second); + } + std::sort(R.begin(), R.end(), + [](const FunctionSamples *L, const FunctionSamples *R) { + return L->getTotalSamples() > R->getTotalSamples(); + }); + } + return R; } /// \brief Get the FunctionSamples for an instruction. 
@@ -588,23 +620,23 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const { /// \returns the FunctionSamples pointer to the inlined instance. const FunctionSamples * SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { - SmallVector<LineLocation, 10> S; + SmallVector<std::pair<LineLocation, StringRef>, 10> S; const DILocation *DIL = Inst.getDebugLoc(); - if (!DIL) { + if (!DIL) return Samples; - } + + const DILocation *PrevDIL = DIL; for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) { - DISubprogram *SP = DIL->getScope()->getSubprogram(); - if (!SP) - return nullptr; - S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()), - DIL->getDiscriminator())); + S.push_back(std::make_pair( + LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()), + PrevDIL->getScope()->getSubprogram()->getLinkageName())); + PrevDIL = DIL; } if (S.size() == 0) return Samples; const FunctionSamples *FS = Samples; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i]); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); } return FS; } @@ -614,14 +646,17 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { /// Iteratively traverse all callsites of the function \p F, and find if /// the corresponding inlined instance exists and is hot in profile. If /// it is hot enough, inline the callsites and adds new callsites of the -/// callee into the caller. -/// -/// TODO: investigate the possibility of not invoking InlineFunction directly. +/// callee into the caller. If the call is an indirect call, first promote +/// it to direct call. Each indirect call is limited with a single target. /// /// \param F function to perform iterative inlining. +/// \param ImportGUIDs a set to be updated to include all GUIDs that come +/// from a different module but inlined in the profiled binary. /// /// \returns True if there is any inline happened. 
-bool SampleProfileLoader::inlineHotFunctions(Function &F) { +bool SampleProfileLoader::inlineHotFunctions( + Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) { + DenseSet<Instruction *> PromotedInsns; bool Changed = false; LLVMContext &Ctx = F.getContext(); std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&]( @@ -647,18 +682,42 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) { } for (auto I : CIS) { InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr); - CallSite CS(I); - Function *CalledFunction = CS.getCalledFunction(); - if (!CalledFunction || !CalledFunction->getSubprogram()) + Function *CalledFunction = CallSite(I).getCalledFunction(); + Instruction *DI = I; + if (!CalledFunction && !PromotedInsns.count(I) && + CallSite(I).isIndirectCall()) + for (const auto *FS : findIndirectCallFunctionSamples(*I)) { + auto CalleeFunctionName = FS->getName(); + const char *Reason = "Callee function not available"; + CalledFunction = F.getParent()->getFunction(CalleeFunctionName); + if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) { + // The indirect target was promoted and inlined in the profile, as a + // result, we do not have profile info for the branch probability. + // We set the probability to 80% taken to indicate that the static + // call is likely taken. 
+ DI = dyn_cast<Instruction>( + promoteIndirectCall(I, CalledFunction, 80, 100, false) + ->stripPointerCasts()); + PromotedInsns.insert(I); + } else { + DEBUG(dbgs() << "\nFailed to promote indirect call to " + << CalleeFunctionName << " because " << Reason + << "\n"); + continue; + } + } + if (!CalledFunction || !CalledFunction->getSubprogram()) { + findCalleeFunctionSamples(*I)->findImportedFunctions( + ImportGUIDs, F.getParent(), + Samples->getTotalSamples() * SampleProfileHotThreshold / 100); continue; + } DebugLoc DLoc = I->getDebugLoc(); - uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples(); - if (InlineFunction(CS, IFI)) { + if (InlineFunction(CallSite(DI), IFI)) { LocalChanged = true; emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc, Twine("inlined hot callee '") + - CalledFunction->getName() + "' with " + - Twine(NumSamples) + " samples into '" + + CalledFunction->getName() + "' into '" + F.getName() + "'"); } } @@ -994,6 +1053,26 @@ void SampleProfileLoader::buildEdges(Function &F) { } } +/// Sorts the CallTargetMap \p M by count in descending order and stores the +/// sorted result in \p Sorted. Returns the total counts. 
+static uint64_t SortCallTargets(SmallVector<InstrProfValueData, 2> &Sorted, + const SampleRecord::CallTargetMap &M) { + Sorted.clear(); + uint64_t Sum = 0; + for (auto I = M.begin(); I != M.end(); ++I) { + Sum += I->getValue(); + Sorted.push_back({Function::getGUID(I->getKey()), I->getValue()}); + } + std::sort(Sorted.begin(), Sorted.end(), + [](const InstrProfValueData &L, const InstrProfValueData &R) { + if (L.Count == R.Count) + return L.Value > R.Value; + else + return L.Count > R.Count; + }); + return Sum; +} + /// \brief Propagate weights into edges /// /// The following rules are applied to every block BB in the CFG: @@ -1015,10 +1094,6 @@ void SampleProfileLoader::propagateWeights(Function &F) { bool Changed = true; unsigned I = 0; - // Add an entry count to the function using the samples gathered - // at the function entry. - F.setEntryCount(Samples->getHeadSamples() + 1); - // If BB weight is larger than its corresponding loop's header BB weight, // use the BB weight to replace the loop header BB weight. 
for (auto &BI : F) { @@ -1071,13 +1146,32 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (BlockWeights[BB]) { for (auto &I : BB->getInstList()) { - if (CallInst *CI = dyn_cast<CallInst>(&I)) { - if (!dyn_cast<IntrinsicInst>(&I)) { - SmallVector<uint32_t, 1> Weights; - Weights.push_back(BlockWeights[BB]); - CI->setMetadata(LLVMContext::MD_prof, - MDB.createBranchWeights(Weights)); - } + if (!isa<CallInst>(I) && !isa<InvokeInst>(I)) + continue; + CallSite CS(&I); + if (!CS.getCalledFunction()) { + const DebugLoc &DLoc = I.getDebugLoc(); + if (!DLoc) + continue; + const DILocation *DIL = DLoc; + uint32_t LineOffset = getOffset(DIL); + uint32_t Discriminator = DIL->getBaseDiscriminator(); + + const FunctionSamples *FS = findFunctionSamples(I); + if (!FS) + continue; + auto T = FS->findCallTargetMapAt(LineOffset, Discriminator); + if (!T || T.get().size() == 0) + continue; + SmallVector<InstrProfValueData, 2> SortedCallTargets; + uint64_t Sum = SortCallTargets(SortedCallTargets, T.get()); + annotateValueSite(*I.getParent()->getParent()->getParent(), I, + SortedCallTargets, Sum, IPVK_IndirectCallTarget, + SortedCallTargets.size()); + } else if (!dyn_cast<IntrinsicInst>(&I)) { + SmallVector<uint32_t, 1> Weights; + Weights.push_back(BlockWeights[BB]); + I.setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); } } } @@ -1115,9 +1209,13 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. - if (MaxWeight > 0) { + // Do not set weights if the weights are present. In ThinLTO, the profile + // annotation is done twice. If the first annotation already set the + // weights, the second pass does not need to set it. + if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) { DEBUG(dbgs() << "SUCCESS. 
Found non-zero weights.\n"); TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); @@ -1228,12 +1326,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { DEBUG(dbgs() << "Line number for the first instruction in " << F.getName() << ": " << getFunctionLoc(F) << "\n"); - Changed |= inlineHotFunctions(F); + DenseSet<GlobalValue::GUID> ImportGUIDs; + Changed |= inlineHotFunctions(F, ImportGUIDs); // Compute basic block weights. Changed |= computeBlockWeights(F); if (Changed) { + // Add an entry count to the function using the samples gathered at the + // function entry. Also sets the GUIDs that comes from a different + // module but inlined in the profiled binary. This is aiming at making + // the IR match the profiled binary before annotation. + F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs); + // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); @@ -1329,7 +1434,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { bool SampleProfileLoader::runOnFunction(Function &F) { F.setEntryCount(0); Samples = Reader->getSamplesFor(F); - if (!Samples->empty()) + if (Samples && !Samples->empty()) return emitAnnotations(F); return false; } diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 8f6f161428e8..fb64367eef91 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -323,6 +323,14 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { LiveGVs.insert(GVE); } + std::set<DICompileUnit *> LiveCUs; + // Any CU referenced from a subprogram is live. + for (DISubprogram *SP : F.subprograms()) { + if (SP->getUnit()) + LiveCUs.insert(SP->getUnit()); + } + + bool HasDeadCUs = false; for (DICompileUnit *DIC : F.compile_units()) { // Create our live global variable list. 
bool GlobalVariableChange = false; @@ -341,6 +349,11 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { GlobalVariableChange = true; } + if (!LiveGlobalVariables.empty()) + LiveCUs.insert(DIC); + else if (!LiveCUs.count(DIC)) + HasDeadCUs = true; + // If we found dead global variables, replace the current global // variable list with our new live global variable list. if (GlobalVariableChange) { @@ -352,5 +365,16 @@ bool StripDeadDebugInfo::runOnModule(Module &M) { LiveGlobalVariables.clear(); } + if (HasDeadCUs) { + // Delete the old node and replace it with a new one + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu"); + NMD->clearOperands(); + if (!LiveCUs.empty()) { + for (DICompileUnit *CU : LiveCUs) + NMD->addOperand(CU); + } + Changed = true; + } + return Changed; } diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 3680cfc813a1..65deb82cd2a5 100644 --- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -14,16 +14,21 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/IPO.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/ModuleSummaryAnalysis.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -41,23 +46,14 @@ namespace { std::string getModuleId(Module *M) { MD5 Md5; bool ExportsSymbols = false; - auto AddGlobal = [&](GlobalValue &GV) { + for (auto &GV : M->global_values()) { if 
(GV.isDeclaration() || GV.getName().startswith("llvm.") || !GV.hasExternalLinkage()) - return; + continue; ExportsSymbols = true; Md5.update(GV.getName()); Md5.update(ArrayRef<uint8_t>{0}); - }; - - for (auto &F : *M) - AddGlobal(F); - for (auto &GV : M->globals()) - AddGlobal(GV); - for (auto &GA : M->aliases()) - AddGlobal(GA); - for (auto &IF : M->ifuncs()) - AddGlobal(IF); + } if (!ExportsSymbols) return ""; @@ -73,15 +69,21 @@ std::string getModuleId(Module *M) { // Promote each local-linkage entity defined by ExportM and used by ImportM by // changing visibility and appending the given ModuleId. void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) { - auto PromoteInternal = [&](GlobalValue &ExportGV) { + DenseMap<const Comdat *, Comdat *> RenamedComdats; + for (auto &ExportGV : ExportM.global_values()) { if (!ExportGV.hasLocalLinkage()) - return; + continue; - GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName()); + auto Name = ExportGV.getName(); + GlobalValue *ImportGV = ImportM.getNamedValue(Name); if (!ImportGV || ImportGV->use_empty()) - return; + continue; + + std::string NewName = (Name + ModuleId).str(); - std::string NewName = (ExportGV.getName() + ModuleId).str(); + if (const auto *C = ExportGV.getComdat()) + if (C->getName() == Name) + RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName)); ExportGV.setName(NewName); ExportGV.setLinkage(GlobalValue::ExternalLinkage); @@ -89,16 +91,15 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) { ImportGV->setName(NewName); ImportGV->setVisibility(GlobalValue::HiddenVisibility); - }; + } - for (auto &F : ExportM) - PromoteInternal(F); - for (auto &GV : ExportM.globals()) - PromoteInternal(GV); - for (auto &GA : ExportM.aliases()) - PromoteInternal(GA); - for (auto &IF : ExportM.ifuncs()) - PromoteInternal(IF); + if (!RenamedComdats.empty()) + for (auto &GO : ExportM.global_objects()) + if (auto *C = GO.getComdat()) { + auto Replacement 
= RenamedComdats.find(C); + if (Replacement != RenamedComdats.end()) + GO.setComdat(Replacement->second); + } } // Promote all internal (i.e. distinct) type ids used by the module by replacing @@ -194,24 +195,7 @@ void simplifyExternals(Module &M) { } void filterModule( - Module *M, std::function<bool(const GlobalValue *)> ShouldKeepDefinition) { - for (Function &F : *M) { - if (ShouldKeepDefinition(&F)) - continue; - - F.deleteBody(); - F.clearMetadata(); - } - - for (GlobalVariable &GV : M->globals()) { - if (ShouldKeepDefinition(&GV)) - continue; - - GV.setInitializer(nullptr); - GV.setLinkage(GlobalValue::ExternalLinkage); - GV.clearMetadata(); - } - + Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) { for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end(); I != E;) { GlobalAlias *GA = &*I++; @@ -219,7 +203,7 @@ void filterModule( continue; GlobalObject *GO; - if (I->getValueType()->isFunctionTy()) + if (GA->getValueType()->isFunctionTy()) GO = Function::Create(cast<FunctionType>(GA->getValueType()), GlobalValue::ExternalLinkage, "", M); else @@ -231,53 +215,168 @@ void filterModule( GA->replaceAllUsesWith(GO); GA->eraseFromParent(); } + + for (Function &F : *M) { + if (ShouldKeepDefinition(&F)) + continue; + + F.deleteBody(); + F.setComdat(nullptr); + F.clearMetadata(); + } + + for (GlobalVariable &GV : M->globals()) { + if (ShouldKeepDefinition(&GV)) + continue; + + GV.setInitializer(nullptr); + GV.setLinkage(GlobalValue::ExternalLinkage); + GV.setComdat(nullptr); + GV.clearMetadata(); + } +} + +void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) { + if (auto *F = dyn_cast<Function>(C)) + return Fn(F); + if (isa<GlobalValue>(C)) + return; + for (Value *Op : C->operands()) + forEachVirtualFunction(cast<Constant>(Op), Fn); } // If it's possible to split M into regular and thin LTO parts, do so and write // a multi-module bitcode file with the two parts to OS. 
Otherwise, write only a // regular LTO bitcode file to OS. -void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) { +void splitAndWriteThinLTOBitcode( + raw_ostream &OS, raw_ostream *ThinLinkOS, + function_ref<AAResults &(Function &)> AARGetter, Module &M) { std::string ModuleId = getModuleId(&M); if (ModuleId.empty()) { // We couldn't generate a module ID for this module, just write it out as a // regular LTO module. WriteBitcodeToFile(&M, OS); + if (ThinLinkOS) + // We don't have a ThinLTO part, but still write the module to the + // ThinLinkOS if requested so that the expected output file is produced. + WriteBitcodeToFile(&M, *ThinLinkOS); return; } promoteTypeIds(M, ModuleId); - auto IsInMergedM = [&](const GlobalValue *GV) { - auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject()); - if (!GVar) - return false; - + // Returns whether a global has attached type metadata. Such globals may + // participate in CFI or whole-program devirtualization, so they need to + // appear in the merged module instead of the thin LTO module. + auto HasTypeMetadata = [&](const GlobalObject *GO) { SmallVector<MDNode *, 1> MDs; - GVar->getMetadata(LLVMContext::MD_type, MDs); + GO->getMetadata(LLVMContext::MD_type, MDs); return !MDs.empty(); }; + // Collect the set of virtual functions that are eligible for virtual constant + // propagation. Each eligible function must not access memory, must return + // an integer of width <=64 bits, must take at least one argument, must not + // use its first argument (assumed to be "this") and all arguments other than + // the first one must be of <=64 bit integer type. + // + // Note that we test whether this copy of the function is readnone, rather + // than testing function attributes, which must hold for any copy of the + // function, even a less optimized version substituted at link time. 
This is + // sound because the virtual constant propagation optimizations effectively + // inline all implementations of the virtual function into each call site, + // rather than using function attributes to perform local optimization. + std::set<const Function *> EligibleVirtualFns; + // If any member of a comdat lives in MergedM, put all members of that + // comdat in MergedM to keep the comdat together. + DenseSet<const Comdat *> MergedMComdats; + for (GlobalVariable &GV : M.globals()) + if (HasTypeMetadata(&GV)) { + if (const auto *C = GV.getComdat()) + MergedMComdats.insert(C); + forEachVirtualFunction(GV.getInitializer(), [&](Function *F) { + auto *RT = dyn_cast<IntegerType>(F->getReturnType()); + if (!RT || RT->getBitWidth() > 64 || F->arg_empty() || + !F->arg_begin()->use_empty()) + return; + for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) { + auto *ArgT = dyn_cast<IntegerType>(Arg.getType()); + if (!ArgT || ArgT->getBitWidth() > 64) + return; + } + if (computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone) + EligibleVirtualFns.insert(F); + }); + } + ValueToValueMapTy VMap; - std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM)); + std::unique_ptr<Module> MergedM( + CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool { + if (const auto *C = GV->getComdat()) + if (MergedMComdats.count(C)) + return true; + if (auto *F = dyn_cast<Function>(GV)) + return EligibleVirtualFns.count(F); + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + return HasTypeMetadata(GVar); + return false; + })); + StripDebugInfo(*MergedM); + + for (Function &F : *MergedM) + if (!F.isDeclaration()) { + // Reset the linkage of all functions eligible for virtual constant + // propagation. The canonical definitions live in the thin LTO module so + // that they can be imported. 
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage); + F.setComdat(nullptr); + } - filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); }); + // Remove all globals with type metadata, globals with comdats that live in + // MergedM, and aliases pointing to such globals from the thin LTO module. + filterModule(&M, [&](const GlobalValue *GV) { + if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject())) + if (HasTypeMetadata(GVar)) + return false; + if (const auto *C = GV->getComdat()) + if (MergedMComdats.count(C)) + return false; + return true; + }); promoteInternals(*MergedM, M, ModuleId); promoteInternals(M, *MergedM, ModuleId); simplifyExternals(*MergedM); - SmallVector<char, 0> Buffer; - BitcodeWriter W(Buffer); // FIXME: Try to re-use BSI and PFI from the original module here. ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr); - W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, - /*GenerateHash=*/true); - W.writeModule(MergedM.get()); + SmallVector<char, 0> Buffer; + BitcodeWriter W(Buffer); + // Save the module hash produced for the full bitcode, which will + // be used in the backends, and use that in the minimized bitcode + // produced for the full link. + ModuleHash ModHash = {{0}}; + W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, + /*GenerateHash=*/true, &ModHash); + W.writeModule(MergedM.get()); OS << Buffer; + + // If a minimized bitcode module was requested for the thin link, + // strip the debug info (the merged module was already stripped above) + // and write it to the given OS. + if (ThinLinkOS) { + Buffer.clear(); + BitcodeWriter W2(Buffer); + StripDebugInfo(M); + W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index, + /*GenerateHash=*/false, &ModHash); + W2.writeModule(MergedM.get()); + *ThinLinkOS << Buffer; + } } // Returns whether this module needs to be split because it uses type metadata. 
@@ -292,28 +391,45 @@ bool requiresSplit(Module &M) { return false; } -void writeThinLTOBitcode(raw_ostream &OS, Module &M, - const ModuleSummaryIndex *Index) { +void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, + function_ref<AAResults &(Function &)> AARGetter, + Module &M, const ModuleSummaryIndex *Index) { // See if this module has any type metadata. If so, we need to split it. if (requiresSplit(M)) - return splitAndWriteThinLTOBitcode(OS, M); + return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M); // Otherwise we can just write it out as a regular module. + + // Save the module hash produced for the full bitcode, which will + // be used in the backends, and use that in the minimized bitcode + // produced for the full link. + ModuleHash ModHash = {{0}}; WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index, - /*GenerateHash=*/true); + /*GenerateHash=*/true, &ModHash); + // If a minimized bitcode module was requested for the thin link, + // strip the debug info and write it to the given OS. + if (ThinLinkOS) { + StripDebugInfo(M); + WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false, + Index, + /*GenerateHash=*/false, &ModHash); + } } class WriteThinLTOBitcode : public ModulePass { raw_ostream &OS; // raw_ostream to print on + // The output stream on which to emit a minimized module for use + // just in the thin link, if requested. 
+ raw_ostream *ThinLinkOS; public: static char ID; // Pass identification, replacement for typeid - WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) { + WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) { initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); } - explicit WriteThinLTOBitcode(raw_ostream &o) - : ModulePass(ID), OS(o) { + explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS) + : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) { initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry()); } @@ -322,12 +438,14 @@ public: bool runOnModule(Module &M) override { const ModuleSummaryIndex *Index = &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex()); - writeThinLTOBitcode(OS, M, Index); + writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index); return true; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<ModuleSummaryIndexWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } // anonymous namespace @@ -335,10 +453,13 @@ public: char WriteThinLTOBitcode::ID = 0; INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode", "Write ThinLTO Bitcode", false, true) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode", "Write ThinLTO Bitcode", false, true) -ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) { - return new WriteThinLTOBitcode(Str); +ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str, + raw_ostream *ThinLinkOS) { + return new WriteThinLTOBitcode(Str, ThinLinkOS); } diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp index 844cc0f70eed..cb7d487b68b0 100644 --- a/lib/Transforms/IPO/WholeProgramDevirt.cpp 
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -25,6 +25,20 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // +// This pass is intended to be used during the regular and thin LTO pipelines. +// During regular LTO, the pass determines the best optimization for each +// virtual call and applies the resolutions directly to virtual calls that are +// eligible for virtual call optimization (i.e. calls that use either of the +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During +// ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over a single merged module +// that contains all vtables with !type metadata that participate in the link. +// The pass computes a resolution for each virtual call and stores it in the +// type identifier summary. +// - Import phase: this is run during the thin backends over the individual +// modules. The pass applies the resolutions previously computed during the +// import phase to each eligible virtual call. 
+// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -35,6 +49,8 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -54,12 +70,16 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndexYAML.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/Utils/Evaluator.h" #include <algorithm> #include <cstddef> @@ -72,6 +92,26 @@ using namespace wholeprogramdevirt; #define DEBUG_TYPE "wholeprogramdevirt" +static cl::opt<PassSummaryAction> ClSummaryAction( + "wholeprogramdevirt-summary-action", + cl::desc("What to do with the summary when running this pass"), + cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"), + clEnumValN(PassSummaryAction::Import, "import", + "Import typeid resolutions from summary and globals"), + clEnumValN(PassSummaryAction::Export, "export", + "Export typeid resolutions to summary and globals")), + cl::Hidden); + +static cl::opt<std::string> ClReadSummary( + "wholeprogramdevirt-read-summary", + cl::desc("Read summary from given YAML file before running pass"), + cl::Hidden); + +static cl::opt<std::string> ClWriteSummary( + "wholeprogramdevirt-write-summary", + cl::desc("Write summary to given YAML file after running pass"), + cl::Hidden); + // Find the minimum offset that we may store a value of size Size bits at. 
If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -259,15 +299,92 @@ struct VirtualCallSite { } }; +// Call site information collected for a specific VTableSlot and possibly a list +// of constant integer arguments. The grouping by arguments is handled by the +// VTableSlotInfo class. +struct CallSiteInfo { + /// The set of call sites for this slot. Used during regular LTO and the + /// import phase of ThinLTO (as well as the export phase of ThinLTO for any + /// call sites that appear in the merged module itself); in each of these + /// cases we are directly operating on the call sites at the IR level. + std::vector<VirtualCallSite> CallSites; + + // These fields are used during the export phase of ThinLTO and reflect + // information collected from function summaries. + + /// Whether any function summary contains an llvm.assume(llvm.type.test) for + /// this slot. + bool SummaryHasTypeTestAssumeUsers; + + /// CFI-specific: a vector containing the list of function summaries that use + /// the llvm.type.checked.load intrinsic and therefore will require + /// resolutions for llvm.type.test in order to implement CFI checks if + /// devirtualization was unsuccessful. If devirtualization was successful, the + /// pass will clear this vector by calling markDevirt(). If at the end of the + /// pass the vector is non-empty, we will need to add a use of llvm.type.test + /// to each of the function summaries in the vector. + std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers; + + bool isExported() const { + return SummaryHasTypeTestAssumeUsers || + !SummaryTypeCheckedLoadUsers.empty(); + } + + /// As explained in the comment for SummaryTypeCheckedLoadUsers. + void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); } +}; + +// Call site information collected for a specific VTableSlot. 
+struct VTableSlotInfo { + // The set of call sites which do not have all constant integer arguments + // (excluding "this"). + CallSiteInfo CSInfo; + + // The set of call sites with all constant integer arguments (excluding + // "this"), grouped by argument list. + std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo; + + void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses); + +private: + CallSiteInfo &findCallSiteInfo(CallSite CS); +}; + +CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) { + std::vector<uint64_t> Args; + auto *CI = dyn_cast<IntegerType>(CS.getType()); + if (!CI || CI->getBitWidth() > 64 || CS.arg_empty()) + return CSInfo; + for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) { + auto *CI = dyn_cast<ConstantInt>(Arg); + if (!CI || CI->getBitWidth() > 64) + return CSInfo; + Args.push_back(CI->getZExtValue()); + } + return ConstCSInfo[Args]; +} + +void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS, + unsigned *NumUnsafeUses) { + findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses}); +} + struct DevirtModule { Module &M; + function_ref<AAResults &(Function &)> AARGetter; + + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; + IntegerType *Int8Ty; PointerType *Int8PtrTy; IntegerType *Int32Ty; + IntegerType *Int64Ty; + IntegerType *IntPtrTy; bool RemarksEnabled; - MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots; + MapVector<VTableSlot, VTableSlotInfo> CallSlots; // This map keeps track of the number of "unsafe" uses of a loaded function // pointer. The key is the associated llvm.type.test intrinsic call generated @@ -279,11 +396,18 @@ struct DevirtModule { // true. 
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest; - DevirtModule(Module &M) - : M(M), Int8Ty(Type::getInt8Ty(M.getContext())), + DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter, + ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary), + ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())), Int8PtrTy(Type::getInt8PtrTy(M.getContext())), Int32Ty(Type::getInt32Ty(M.getContext())), - RemarksEnabled(areRemarksEnabled()) {} + Int64Ty(Type::getInt64Ty(M.getContext())), + IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)), + RemarksEnabled(areRemarksEnabled()) { + assert(!(ExportSummary && ImportSummary)); + } bool areRemarksEnabled(); @@ -298,57 +422,169 @@ struct DevirtModule { tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset); + + void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, + bool &IsExported); bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res); + bool tryEvaluateFunctionsWithArgs( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<ConstantInt *> Args); - bool tryUniformRetValOpt(IntegerType *RetType, - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + ArrayRef<uint64_t> Args); + + void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + uint64_t TheRetVal); + bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, + CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res); + + // Returns the global symbol name that is used to export information about the + // given vtable slot and list of arguments. 
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name); + + // This function is called during the export phase to create a symbol + // definition containing information about the given vtable slot and list of + // arguments. + void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name, + Constant *C); + + // This function is called during the import phase to create a reference to + // the symbol definition created during the export phase. + Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, unsigned AbsWidth = 0); + + void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne, + Constant *UniqueMemberAddr); bool tryUniqueRetValOpt(unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites); + CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res, + VTableSlot Slot, ArrayRef<uint64_t> Args); + + void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, + Constant *Byte, Constant *Bit); bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<VirtualCallSite> CallSites); + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, VTableSlot Slot); void rebuildGlobal(VTableBits &B); + // Apply the summary resolution for Slot to all virtual calls in SlotInfo. + void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo); + + // If we were able to eliminate all unsafe uses for a type checked load, + // eliminate the associated type tests by replacing them with true. + void removeRedundantTypeTests(); + bool run(); + + // Lower the module using the action and summary passed as command line + // arguments. For testing purposes only. 
+ static bool runForTesting(Module &M, + function_ref<AAResults &(Function &)> AARGetter); }; struct WholeProgramDevirt : public ModulePass { static char ID; - WholeProgramDevirt() : ModulePass(ID) { + bool UseCommandLine = false; + + ModuleSummaryIndex *ExportSummary; + const ModuleSummaryIndex *ImportSummary; + + WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) { + initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); + } + + WholeProgramDevirt(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) + : ModulePass(ID), ExportSummary(ExportSummary), + ImportSummary(ImportSummary) { initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); } bool runOnModule(Module &M) override { if (skipModule(M)) return false; + if (UseCommandLine) + return DevirtModule::runForTesting(M, LegacyAARGetter(*this)); + return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary) + .run(); + } - return DevirtModule(M).run(); + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; } // end anonymous namespace -INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt", - "Whole program devirtualization", false, false) +INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt", + "Whole program devirtualization", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt", + "Whole program devirtualization", false, false) char WholeProgramDevirt::ID = 0; -ModulePass *llvm::createWholeProgramDevirtPass() { - return new WholeProgramDevirt; +ModulePass * +llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary, + const ModuleSummaryIndex *ImportSummary) { + return new WholeProgramDevirt(ExportSummary, ImportSummary); } PreservedAnalyses WholeProgramDevirtPass::run(Module &M, - 
ModuleAnalysisManager &) { - if (!DevirtModule(M).run()) + ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto AARGetter = [&](Function &F) -> AAResults & { + return FAM.getResult<AAManager>(F); + }; + if (!DevirtModule(M, AARGetter, nullptr, nullptr).run()) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } +bool DevirtModule::runForTesting( + Module &M, function_ref<AAResults &(Function &)> AARGetter) { + ModuleSummaryIndex Summary; + + // Handle the command-line summary arguments. This code is for testing + // purposes only, so we handle errors directly. + if (!ClReadSummary.empty()) { + ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary + + ": "); + auto ReadSummaryFile = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary))); + + yaml::Input In(ReadSummaryFile->getBuffer()); + In >> Summary; + ExitOnErr(errorCodeToError(In.error())); + } + + bool Changed = + DevirtModule( + M, AARGetter, + ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr, + ClSummaryAction == PassSummaryAction::Import ? 
&Summary : nullptr) + .run(); + + if (!ClWriteSummary.empty()) { + ExitOnError ExitOnErr( + "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": "); + std::error_code EC; + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + ExitOnErr(errorCodeToError(EC)); + + yaml::Output Out(OS); + Out << Summary; + } + + return Changed; +} + void DevirtModule::buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) { @@ -443,9 +679,31 @@ bool DevirtModule::tryFindVirtualCallTargets( return !TargetsForSlot.empty(); } +void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, + Constant *TheFn, bool &IsExported) { + auto Apply = [&](CallSiteInfo &CSInfo) { + for (auto &&VCallSite : CSInfo.CallSites) { + if (RemarksEnabled) + VCallSite.emitRemark("single-impl", TheFn->getName()); + VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast( + TheFn, VCallSite.CS.getCalledValue()->getType())); + // This use is no longer unsafe. + if (VCallSite.NumUnsafeUses) + --*VCallSite.NumUnsafeUses; + } + if (CSInfo.isExported()) { + IsExported = true; + CSInfo.markDevirt(); + } + }; + Apply(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + Apply(P.second); +} + bool DevirtModule::trySingleImplDevirt( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) { // See if the program contains a single implementation of this virtual // function. Function *TheFn = TargetsForSlot[0].Fn; @@ -453,39 +711,51 @@ bool DevirtModule::trySingleImplDevirt( if (TheFn != Target.Fn) return false; + // If so, update each call site to call that implementation directly. if (RemarksEnabled) TargetsForSlot[0].WasDevirt = true; - // If so, update each call site to call that implementation directly. 
- for (auto &&VCallSite : CallSites) { - if (RemarksEnabled) - VCallSite.emitRemark("single-impl", TheFn->getName()); - VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast( - TheFn, VCallSite.CS.getCalledValue()->getType())); - // This use is no longer unsafe. - if (VCallSite.NumUnsafeUses) - --*VCallSite.NumUnsafeUses; + + bool IsExported = false; + applySingleImplDevirt(SlotInfo, TheFn, IsExported); + if (!IsExported) + return false; + + // If the only implementation has local linkage, we must promote to external + // to make it visible to thin LTO objects. We can only get here during the + // ThinLTO export phase. + if (TheFn->hasLocalLinkage()) { + TheFn->setLinkage(GlobalValue::ExternalLinkage); + TheFn->setVisibility(GlobalValue::HiddenVisibility); + TheFn->setName(TheFn->getName() + "$merged"); } + + Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + Res->SingleImplName = TheFn->getName(); + return true; } bool DevirtModule::tryEvaluateFunctionsWithArgs( MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<ConstantInt *> Args) { + ArrayRef<uint64_t> Args) { // Evaluate each function and store the result in each target's RetVal // field. 
for (VirtualCallTarget &Target : TargetsForSlot) { if (Target.Fn->arg_size() != Args.size() + 1) return false; - for (unsigned I = 0; I != Args.size(); ++I) - if (Target.Fn->getFunctionType()->getParamType(I + 1) != - Args[I]->getType()) - return false; Evaluator Eval(M.getDataLayout(), nullptr); SmallVector<Constant *, 2> EvalArgs; EvalArgs.push_back( Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0))); - EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end()); + for (unsigned I = 0; I != Args.size(); ++I) { + auto *ArgTy = dyn_cast<IntegerType>( + Target.Fn->getFunctionType()->getParamType(I + 1)); + if (!ArgTy) + return false; + EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I])); + } + Constant *RetVal; if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) || !isa<ConstantInt>(RetVal)) @@ -495,9 +765,18 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs( return true; } +void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + uint64_t TheRetVal) { + for (auto Call : CSInfo.CallSites) + Call.replaceAndErase( + "uniform-ret-val", FnName, RemarksEnabled, + ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal)); + CSInfo.markDevirt(); +} + bool DevirtModule::tryUniformRetValOpt( - IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo, + WholeProgramDevirtResolution::ByArg *Res) { // Uniform return value optimization. If all functions return the same // constant, replace all calls with that constant. 
uint64_t TheRetVal = TargetsForSlot[0].RetVal; @@ -505,19 +784,77 @@ bool DevirtModule::tryUniformRetValOpt( if (Target.RetVal != TheRetVal) return false; - auto TheRetValConst = ConstantInt::get(RetType, TheRetVal); - for (auto Call : CallSites) - Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(), - RemarksEnabled, TheRetValConst); + if (CSInfo.isExported()) { + Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal; + Res->Info = TheRetVal; + } + + applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal); if (RemarksEnabled) for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; return true; } +std::string DevirtModule::getGlobalName(VTableSlot Slot, + ArrayRef<uint64_t> Args, + StringRef Name) { + std::string FullName = "__typeid_"; + raw_string_ostream OS(FullName); + OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset; + for (uint64_t Arg : Args) + OS << '_' << Arg; + OS << '_' << Name; + return OS.str(); +} + +void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, Constant *C) { + GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage, + getGlobalName(Slot, Args, Name), C, &M); + GA->setVisibility(GlobalValue::HiddenVisibility); +} + +Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, + StringRef Name, unsigned AbsWidth) { + Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty); + auto *GV = dyn_cast<GlobalVariable>(C); + // We only need to set metadata if the global is newly created, in which + // case it would not have hidden visibility. 
+ if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility) + return C; + + GV->setVisibility(GlobalValue::HiddenVisibility); + auto SetAbsRange = [&](uint64_t Min, uint64_t Max) { + auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min)); + auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max)); + GV->setMetadata(LLVMContext::MD_absolute_symbol, + MDNode::get(M.getContext(), {MinC, MaxC})); + }; + if (AbsWidth == IntPtrTy->getBitWidth()) + SetAbsRange(~0ull, ~0ull); // Full set. + else if (AbsWidth) + SetAbsRange(0, 1ull << AbsWidth); + return GV; +} + +void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, + bool IsOne, + Constant *UniqueMemberAddr) { + for (auto &&Call : CSInfo.CallSites) { + IRBuilder<> B(Call.CS.getInstruction()); + Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, + Call.VTable, UniqueMemberAddr); + Cmp = B.CreateZExt(Cmp, Call.CS->getType()); + Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp); + } + CSInfo.markDevirt(); +} + bool DevirtModule::tryUniqueRetValOpt( unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot, - MutableArrayRef<VirtualCallSite> CallSites) { + CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res, + VTableSlot Slot, ArrayRef<uint64_t> Args) { // IsOne controls whether we look for a 0 or a 1. auto tryUniqueRetValOptFor = [&](bool IsOne) { const TypeMemberInfo *UniqueMember = nullptr; @@ -533,16 +870,23 @@ bool DevirtModule::tryUniqueRetValOpt( // checked for a uniform return value in tryUniformRetValOpt. assert(UniqueMember); - // Replace each call with the comparison. - for (auto &&Call : CallSites) { - IRBuilder<> B(Call.CS.getInstruction()); - Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy); - OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset); - Value *Cmp = B.CreateICmp(IsOne ? 
ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, - Call.VTable, OneAddr); - Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(), - RemarksEnabled, Cmp); + Constant *UniqueMemberAddr = + ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy); + UniqueMemberAddr = ConstantExpr::getGetElementPtr( + Int8Ty, UniqueMemberAddr, + ConstantInt::get(Int64Ty, UniqueMember->Offset)); + + if (CSInfo.isExported()) { + Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal; + Res->Info = IsOne; + + exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr); } + + // Replace each call with the comparison. + applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne, + UniqueMemberAddr); + // Update devirtualization statistics for targets. if (RemarksEnabled) for (auto &&Target : TargetsForSlot) @@ -560,9 +904,30 @@ bool DevirtModule::tryUniqueRetValOpt( return false; } +void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName, + Constant *Byte, Constant *Bit) { + for (auto Call : CSInfo.CallSites) { + auto *RetType = cast<IntegerType>(Call.CS.getType()); + IRBuilder<> B(Call.CS.getInstruction()); + Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte); + if (RetType->getBitWidth() == 1) { + Value *Bits = B.CreateLoad(Addr); + Value *BitsAndBit = B.CreateAnd(Bits, Bit); + auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); + Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled, + IsBitSet); + } else { + Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); + Value *Val = B.CreateLoad(RetType, ValAddr); + Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val); + } + } + CSInfo.markDevirt(); +} + bool DevirtModule::tryVirtualConstProp( - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - ArrayRef<VirtualCallSite> CallSites) { + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, VTableSlot Slot) { 
// This only works if the function returns an integer. auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType()); if (!RetType) @@ -571,55 +936,38 @@ bool DevirtModule::tryVirtualConstProp( if (BitWidth > 64) return false; - // Make sure that each function does not access memory, takes at least one - // argument, does not use its first argument (which we assume is 'this'), - // and has the same return type. + // Make sure that each function is defined, does not access memory, takes at + // least one argument, does not use its first argument (which we assume is + // 'this'), and has the same return type. + // + // Note that we test whether this copy of the function is readnone, rather + // than testing function attributes, which must hold for any copy of the + // function, even a less optimized version substituted at link time. This is + // sound because the virtual constant propagation optimizations effectively + // inline all implementations of the virtual function into each call site, + // rather than using function attributes to perform local optimization. for (VirtualCallTarget &Target : TargetsForSlot) { - if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() || - !Target.Fn->arg_begin()->use_empty() || + if (Target.Fn->isDeclaration() || + computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) != + MAK_ReadNone || + Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() || Target.Fn->getReturnType() != RetType) return false; } - // Group call sites by the list of constant arguments they pass. - // The comparator ensures deterministic ordering. 
- struct ByAPIntValue { - bool operator()(const std::vector<ConstantInt *> &A, - const std::vector<ConstantInt *> &B) const { - return std::lexicographical_compare( - A.begin(), A.end(), B.begin(), B.end(), - [](ConstantInt *AI, ConstantInt *BI) { - return AI->getValue().ult(BI->getValue()); - }); - } - }; - std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>, - ByAPIntValue> - VCallSitesByConstantArg; - for (auto &&VCallSite : CallSites) { - std::vector<ConstantInt *> Args; - if (VCallSite.CS.getType() != RetType) - continue; - for (auto &&Arg : - make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) { - if (!isa<ConstantInt>(Arg)) - break; - Args.push_back(cast<ConstantInt>(&Arg)); - } - if (Args.size() + 1 != VCallSite.CS.arg_size()) - continue; - - VCallSitesByConstantArg[Args].push_back(VCallSite); - } - - for (auto &&CSByConstantArg : VCallSitesByConstantArg) { + for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) { if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first)) continue; - if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second)) + WholeProgramDevirtResolution::ByArg *ResByArg = nullptr; + if (Res) + ResByArg = &Res->ResByArg[CSByConstantArg.first]; + + if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg)) continue; - if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second)) + if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second, + ResByArg, Slot, CSByConstantArg.first)) continue; // Find an allocation offset in bits in all vtables associated with the @@ -659,26 +1007,20 @@ bool DevirtModule::tryVirtualConstProp( for (auto &&Target : TargetsForSlot) Target.WasDevirt = true; - // Rewrite each call to a load from OffsetByte/OffsetBit. 
- for (auto Call : CSByConstantArg.second) { - IRBuilder<> B(Call.CS.getInstruction()); - Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte); - if (BitWidth == 1) { - Value *Bits = B.CreateLoad(Addr); - Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); - Value *BitsAndBit = B.CreateAnd(Bits, Bit); - auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0)); - Call.replaceAndErase("virtual-const-prop-1-bit", - TargetsForSlot[0].Fn->getName(), - RemarksEnabled, IsBitSet); - } else { - Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo()); - Value *Val = B.CreateLoad(RetType, ValAddr); - Call.replaceAndErase("virtual-const-prop", - TargetsForSlot[0].Fn->getName(), - RemarksEnabled, Val); - } + Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte); + Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit); + + if (CSByConstantArg.second.isExported()) { + ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp; + exportGlobal(Slot, CSByConstantArg.first, "byte", + ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy)); + exportGlobal(Slot, CSByConstantArg.first, "bit", + ConstantExpr::getIntToPtr(BitConst, Int8PtrTy)); } + + // Rewrite each call to a load from OffsetByte/OffsetBit. 
+ applyVirtualConstProp(CSByConstantArg.second, + TargetsForSlot[0].Fn->getName(), ByteConst, BitConst); } return true; } @@ -733,7 +1075,11 @@ bool DevirtModule::areRemarksEnabled() { if (FL.empty()) return false; const Function &Fn = FL.front(); - auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), ""); + + const auto &BBL = Fn.getBasicBlockList(); + if (BBL.empty()) + return false; + auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front()); return DI.isEnabled(); } @@ -766,8 +1112,8 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, Value *Ptr = CI->getArgOperand(0)->stripPointerCasts(); if (SeenPtrs.insert(Ptr).second) { for (DevirtCallSite Call : DevirtCalls) { - CallSlots[{TypeId, Call.Offset}].push_back( - {CI->getArgOperand(0), Call.CS, nullptr}); + CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0), + Call.CS, nullptr); } } } @@ -853,14 +1199,79 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { if (HasNonCallUses) ++NumUnsafeUses; for (DevirtCallSite Call : DevirtCalls) { - CallSlots[{TypeId, Call.Offset}].push_back( - {Ptr, Call.CS, &NumUnsafeUses}); + CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, + &NumUnsafeUses); } CI->eraseFromParent(); } } +void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + const TypeIdSummary *TidSummary = + ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString()); + if (!TidSummary) + return; + auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); + if (ResI == TidSummary->WPDRes.end()) + return; + const WholeProgramDevirtResolution &Res = ResI->second; + + if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + // The type of the function in the declaration is irrelevant because every + // call site will cast it to the correct type. 
+ auto *SingleImpl = M.getOrInsertFunction( + Res.SingleImplName, Type::getVoidTy(M.getContext())); + + // This is the import phase so we should not be exporting anything. + bool IsExported = false; + applySingleImplDevirt(SlotInfo, SingleImpl, IsExported); + assert(!IsExported); + } + + for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) { + auto I = Res.ResByArg.find(CSByConstantArg.first); + if (I == Res.ResByArg.end()) + continue; + auto &ResByArg = I->second; + // FIXME: We should figure out what to do about the "function name" argument + // to the apply* functions, as the function names are unavailable during the + // importing phase. For now we just pass the empty string. This does not + // impact correctness because the function names are just used for remarks. + switch (ResByArg.TheKind) { + case WholeProgramDevirtResolution::ByArg::UniformRetVal: + applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info); + break; + case WholeProgramDevirtResolution::ByArg::UniqueRetVal: { + Constant *UniqueMemberAddr = + importGlobal(Slot, CSByConstantArg.first, "unique_member"); + applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info, + UniqueMemberAddr); + break; + } + case WholeProgramDevirtResolution::ByArg::VirtualConstProp: { + Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32); + Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty); + Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8); + Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty); + applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit); + } + default: + break; + } + } +} + +void DevirtModule::removeRedundantTypeTests() { + auto True = ConstantInt::getTrue(M.getContext()); + for (auto &&U : NumUnsafeUsesForTypeTest) { + if (U.second == 0) { + U.first->replaceAllUsesWith(True); + U.first->eraseFromParent(); + } + } +} + bool DevirtModule::run() { Function *TypeTestFunc = M.getFunction(Intrinsic::getName(Intrinsic::type_test)); @@ -868,7 +1279,11 @@ bool 
DevirtModule::run() { M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume)); - if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc || + // Normally if there are no users of the devirtualization intrinsics in the + // module, this pass has nothing to do. But if we are exporting, we also need + // to handle any users that appear only in the function summaries. + if (!ExportSummary && + (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc || AssumeFunc->use_empty()) && (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty())) return false; @@ -879,6 +1294,17 @@ bool DevirtModule::run() { if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); + if (ImportSummary) { + for (auto &S : CallSlots) + importResolution(S.first, S.second); + + removeRedundantTypeTests(); + + // The rest of the code is only necessary when exporting or during regular + // LTO, so we are done. + return true; + } + // Rebuild type metadata into a map for easy lookup. std::vector<VTableBits> Bits; DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap; @@ -886,6 +1312,53 @@ bool DevirtModule::run() { if (TypeIdMap.empty()) return true; + // Collect information from summary about which calls to try to devirtualize. + if (ExportSummary) { + DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID; + for (auto &P : TypeIdMap) { + if (auto *TypeId = dyn_cast<MDString>(P.first)) + MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back( + TypeId); + } + + for (auto &P : *ExportSummary) { + for (auto &S : P.second) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. 
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (Metadata *MD : MetadataByGUID[VF.GUID]) { + CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers = + true; + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (Metadata *MD : MetadataByGUID[VF.GUID]) { + CallSlots[{MD, VF.Offset}] + .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { + CallSlots[{MD, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .SummaryHasTypeTestAssumeUsers = true; + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { + CallSlots[{MD, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .SummaryTypeCheckedLoadUsers.push_back(FS); + } + } + } + } + } + // For each (type, offset) pair: bool DidVirtualConstProp = false; std::map<std::string, Function*> DevirtTargets; @@ -894,19 +1367,39 @@ bool DevirtModule::run() { // function implementation at offset S.first.ByteOffset, and add to // TargetsForSlot. 
std::vector<VirtualCallTarget> TargetsForSlot; - if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID], - S.first.ByteOffset)) - continue; - - if (!trySingleImplDevirt(TargetsForSlot, S.second) && - tryVirtualConstProp(TargetsForSlot, S.second)) + if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID], + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = nullptr; + if (ExportSummary && isa<MDString>(S.first.TypeID)) + Res = &ExportSummary + ->getOrInsertTypeIdSummary( + cast<MDString>(S.first.TypeID)->getString()) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) && + tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first)) DidVirtualConstProp = true; - // Collect functions devirtualized at least for one call site for stats. - if (RemarksEnabled) - for (const auto &T : TargetsForSlot) - if (T.WasDevirt) - DevirtTargets[T.Fn->getName()] = T.Fn; + // Collect functions devirtualized at least for one call site for stats. + if (RemarksEnabled) + for (const auto &T : TargetsForSlot) + if (T.WasDevirt) + DevirtTargets[T.Fn->getName()] = T.Fn; + } + + // CFI-specific: if we are exporting and any llvm.type.checked.load + // intrinsics were *not* devirtualized, we need to add the resulting + // llvm.type.test intrinsics to the function summaries so that the + // LowerTypeTests pass will export them. + if (ExportSummary && isa<MDString>(S.first.TypeID)) { + auto GUID = + GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString()); + for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers) + FS->addTypeTest(GUID); + for (auto &CCS : S.second.ConstCSInfo) + for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers) + FS->addTypeTest(GUID); + } } if (RemarksEnabled) { @@ -914,23 +1407,12 @@ bool DevirtModule::run() { for (const auto &DT : DevirtTargets) { Function *F = DT.second; DISubprogram *SP = F->getSubprogram(); - DebugLoc DL = SP ? 
DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc(); - emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL, + emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP, Twine("devirtualized ") + F->getName()); } } - // If we were able to eliminate all unsafe uses for a type checked load, - // eliminate the type test by replacing it with true. - if (TypeCheckedLoadFunc) { - auto True = ConstantInt::getTrue(M.getContext()); - for (auto &&U : NumUnsafeUsesForTypeTest) { - if (U.second == 0) { - U.first->replaceAllUsesWith(True); - U.first->eraseFromParent(); - } - } - } + removeRedundantTypeTests(); // Rebuild each global we touched as part of virtual constant propagation to // include the before and after bytes. |