about summary refs log tree commit diff
path: root/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  273
1 files changed, 102 insertions, 171 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 318c4c06f0f7..3721564890dd 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -24,12 +24,6 @@
// memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
// fpowi
-// Future integer operation idioms to recognize:
-// ctpop
-//
-// Beware that isel's default lowering for ctpop is highly inefficient for
-// i64 and larger types when i64 is legal and the value has few bits set. It
-// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
@@ -61,7 +55,6 @@
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -85,14 +78,11 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -255,62 +245,8 @@ private:
/// @}
};
-
-class LoopIdiomRecognizeLegacyPass : public LoopPass {
-public:
- static char ID;
-
- explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
- initializeLoopIdiomRecognizeLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (DisableLIRP::All)
- return false;
-
- if (skipLoop(L))
- return false;
-
- AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
-
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
-
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
- return LIR.runOnLoop(L);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-
} // end anonymous namespace
-char LoopIdiomRecognizeLegacyPass::ID = 0;
-
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
@@ -335,18 +271,8 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
return PA;
}
-INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-
-Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
-
static void deleteDeadInstruction(Instruction *I) {
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
I->eraseFromParent();
}
@@ -442,7 +368,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
// array. We could theoretically do a store to an alloca or something, but
// that doesn't seem worthwhile.
Constant *C = dyn_cast<Constant>(V);
- if (!C)
+ if (!C || isa<ConstantExpr>(C))
return nullptr;
// Only handle simple values that are a power of two bytes in size.
@@ -497,8 +423,8 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// When storing out scalable vectors we bail out for now, since the code
// below currently only works for constant strides.
TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
- if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
- (SizeInBits.getFixedSize() >> 32) != 0)
+ if (SizeInBits.isScalable() || (SizeInBits.getFixedValue() & 7) ||
+ (SizeInBits.getFixedValue() >> 32) != 0)
return LegalStoreKind::None;
// See if the pointer expression is an AddRec like {base,+,1} on the current
@@ -798,7 +724,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
}
/// processLoopMemIntrinsic - Template function for calling different processor
-/// functions based on mem instrinsic type.
+/// functions based on mem intrinsic type.
template <typename MemInst>
bool LoopIdiomRecognize::processLoopMemIntrinsic(
BasicBlock *BB,
@@ -995,9 +921,8 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
SmallPtrSet<Instruction *, 1> MSIs;
MSIs.insert(MSI);
return processLoopStridedStore(Pointer, SE->getSCEV(MSI->getLength()),
- MaybeAlign(MSI->getDestAlignment()),
- SplatValue, MSI, MSIs, Ev, BECount,
- IsNegStride, /*IsLoopMemset=*/true);
+ MSI->getDestAlign(), SplatValue, MSI, MSIs, Ev,
+ BECount, IsNegStride, /*IsLoopMemset=*/true);
}
/// mayLoopAccessLocation - Return true if the specified loop might access the
@@ -1017,9 +942,13 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
// to be exactly the size of the memset, which is (BECount+1)*StoreSize
const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount);
const SCEVConstant *ConstSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
- if (BECst && ConstSize)
- AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
- ConstSize->getValue()->getZExtValue());
+ if (BECst && ConstSize) {
+ std::optional<uint64_t> BEInt = BECst->getAPInt().tryZExtValue();
+ std::optional<uint64_t> SizeInt = ConstSize->getAPInt().tryZExtValue();
+ // FIXME: Should this check for overflow?
+ if (BEInt && SizeInt)
+ AccessSize = LocationSize::precise((*BEInt + 1) * *SizeInt);
+ }
// TODO: For this to be really effective, we have to dive into the pointer
// operand in the store. Store to &A[i] of 100 will always return may alias
@@ -1030,8 +959,7 @@ mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
for (BasicBlock *B : L->blocks())
for (Instruction &I : *B)
if (!IgnoredInsts.contains(&I) &&
- isModOrRefSet(
- intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
+ isModOrRefSet(AA.getModRefInfo(&I, StoreLoc) & Access))
return true;
return false;
}
@@ -1053,33 +981,6 @@ static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
return SE->getMinusSCEV(Start, Index);
}
-/// Compute trip count from the backedge taken count.
-static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
- Loop *CurLoop, const DataLayout *DL,
- ScalarEvolution *SE) {
- const SCEV *TripCountS = nullptr;
- // The # stored bytes is (BECount+1). Expand the trip count out to
- // pointer size if it isn't already.
- //
- // If we're going to need to zero extend the BE count, check if we can add
- // one to it prior to zero extending without overflow. Provided this is safe,
- // it allows better simplification of the +1.
- if (DL->getTypeSizeInBits(BECount->getType()) <
- DL->getTypeSizeInBits(IntPtr) &&
- SE->isLoopEntryGuardedByCond(
- CurLoop, ICmpInst::ICMP_NE, BECount,
- SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
- TripCountS = SE->getZeroExtendExpr(
- SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
- IntPtr);
- } else {
- TripCountS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
- SE->getOne(IntPtr), SCEV::FlagNUW);
- }
-
- return TripCountS;
-}
-
/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
@@ -1087,8 +988,8 @@ static const SCEV *getTripCount(const SCEV *BECount, Type *IntPtr,
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
const SCEV *StoreSizeSCEV, Loop *CurLoop,
const DataLayout *DL, ScalarEvolution *SE) {
- const SCEV *TripCountSCEV = getTripCount(BECount, IntPtr, CurLoop, DL, SE);
-
+ const SCEV *TripCountSCEV =
+ SE->getTripCountFromExitCount(BECount, IntPtr, CurLoop);
return SE->getMulExpr(TripCountSCEV,
SE->getTruncateOrZeroExtend(StoreSizeSCEV, IntPtr),
SCEV::FlagNUW);
@@ -1101,6 +1002,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *StoredVal, Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
const SCEV *BECount, bool IsNegStride, bool IsLoopMemset) {
+ Module *M = TheStore->getModule();
Value *SplatValue = isBytewiseValue(StoredVal, *DL);
Constant *PatternValue = nullptr;
@@ -1119,7 +1021,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
SCEVExpander Expander(*SE, *DL, "loop-idiom");
SCEVExpanderCleaner ExpCleaner(Expander);
- Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *DestInt8PtrTy = Builder.getPtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
bool Changed = false;
@@ -1130,7 +1032,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(Start, *SE))
+ if (!Expander.isSafeToExpand(Start))
return Changed;
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
@@ -1164,32 +1066,37 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(NumBytesS, *SE))
+ if (!Expander.isSafeToExpand(NumBytesS))
return Changed;
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+ if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
+ return Changed;
+
+ AAMDNodes AATags = TheStore->getAAMetadata();
+ for (Instruction *Store : Stores)
+ AATags = AATags.merge(Store->getAAMetadata());
+ if (auto CI = dyn_cast<ConstantInt>(NumBytes))
+ AATags = AATags.extendTo(CI->getZExtValue());
+ else
+ AATags = AATags.extendTo(-1);
+
CallInst *NewCall;
if (SplatValue) {
- AAMDNodes AATags = TheStore->getAAMetadata();
- if (auto CI = dyn_cast<ConstantInt>(NumBytes))
- AATags = AATags.extendTo(CI->getZExtValue());
- else
- AATags = AATags.extendTo(-1);
-
NewCall = Builder.CreateMemSet(
BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
} else {
+ assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
// Everything is emitted in default address space
Type *Int8PtrTy = DestInt8PtrTy;
- Module *M = TheStore->getModule();
StringRef FuncName = "memset_pattern16";
- FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntIdxTy);
- inferLibFuncAttributes(M, FuncName, *TLI);
+ FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
+ Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
+ inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
// Otherwise we should form a memset_pattern16. PatternValue is known to be
// an constant array of 16-bytes. Plop the value into a mergable global.
@@ -1198,9 +1105,20 @@ bool LoopIdiomRecognize::processLoopStridedStore(
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
GV->setAlignment(Align(16));
- Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ Value *PatternPtr = GV;
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
- }
+
+ // Set the TBAA info if present.
+ if (AATags.TBAA)
+ NewCall->setMetadata(LLVMContext::MD_tbaa, AATags.TBAA);
+
+ if (AATags.Scope)
+ NewCall->setMetadata(LLVMContext::MD_alias_scope, AATags.Scope);
+
+ if (AATags.NoAlias)
+ NewCall->setMetadata(LLVMContext::MD_noalias, AATags.NoAlias);
+ }
+
NewCall->setDebugLoc(TheStore->getDebugLoc());
if (MSSAU) {
@@ -1271,13 +1189,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
StoreEv, LoadEv, BECount);
}
+namespace {
class MemmoveVerifier {
public:
explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
const DataLayout &DL)
- : DL(DL), LoadOff(0), StoreOff(0),
- BP1(llvm::GetPointerBaseWithConstantOffset(
- LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
+ : DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
+ LoadBasePtr.stripPointerCasts(), LoadOff, DL)),
BP2(llvm::GetPointerBaseWithConstantOffset(
StoreBasePtr.stripPointerCasts(), StoreOff, DL)),
IsSameObject(BP1 == BP2) {}
@@ -1295,7 +1213,7 @@ public:
// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
// for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
int64_t LoadSize =
- DL.getTypeSizeInBits(TheLoad.getType()).getFixedSize() / 8;
+ DL.getTypeSizeInBits(TheLoad.getType()).getFixedValue() / 8;
if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
return false;
if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
@@ -1307,14 +1225,15 @@ public:
private:
const DataLayout &DL;
- int64_t LoadOff;
- int64_t StoreOff;
+ int64_t LoadOff = 0;
+ int64_t StoreOff = 0;
const Value *BP1;
const Value *BP2;
public:
const bool IsSameObject;
};
+} // namespace
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
Value *DestPtr, Value *SourcePtr, const SCEV *StoreSizeSCEV,
@@ -1363,7 +1282,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// feeds the stores. Check for an alias by generating the base address and
// checking everything.
Value *StoreBasePtr = Expander.expandCodeFor(
- StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+ StrStart, Builder.getPtrTy(StrAS), Preheader->getTerminator());
// From here on out, conservatively report to the pass manager that we've
// changed the IR, even if we later clean up these added instructions. There
@@ -1415,31 +1334,24 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// For a memcpy, we have to make sure that the input array is not being
// mutated by the loop.
- Value *LoadBasePtr = Expander.expandCodeFor(
- LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+ Value *LoadBasePtr = Expander.expandCodeFor(LdStart, Builder.getPtrTy(LdAS),
+ Preheader->getTerminator());
// If the store is a memcpy instruction, we must check if it will write to
// the load memory locations. So remove it from the ignored stores.
- if (IsMemCpy)
- IgnoredInsts.erase(TheStore);
MemmoveVerifier Verifier(*LoadBasePtr, *StoreBasePtr, *DL);
+ if (IsMemCpy && !Verifier.IsSameObject)
+ IgnoredInsts.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSizeSCEV, *AA, IgnoredInsts)) {
- if (!IsMemCpy) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad",
- TheLoad)
- << ore::NV("Inst", InstRemark) << " in "
- << ore::NV("Function", TheStore->getFunction())
- << " function will not be hoisted: "
- << ore::NV("Reason", "The loop may access load location");
- });
- return Changed;
- }
- // At this point loop may access load only for memcpy in same underlying
- // object. If that's not the case bail out.
- if (!Verifier.IsSameObject)
- return Changed;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
+ << ore::NV("Inst", InstRemark) << " in "
+ << ore::NV("Function", TheStore->getFunction())
+ << " function will not be hoisted: "
+ << ore::NV("Reason", "The loop may access load location");
+ });
+ return Changed;
}
bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
@@ -1487,9 +1399,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
return Changed;
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
- assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
+ assert((StoreAlign && LoadAlign) &&
"Expect unordered load/store to have align.");
- if (StoreAlign.getValue() < StoreSize || LoadAlign.getValue() < StoreSize)
+ if (*StoreAlign < StoreSize || *LoadAlign < StoreSize)
return Changed;
// If the element.atomic memcpy is not lowered into explicit
@@ -1503,9 +1415,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// Note that unordered atomic loads/stores are *required* by the spec to
// have an alignment but non-atomic loads/stores may not.
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
- NumBytes, StoreSize, AATags.TBAA, AATags.TBAAStruct, AATags.Scope,
- AATags.NoAlias);
+ StoreBasePtr, *StoreAlign, LoadBasePtr, *LoadAlign, NumBytes, StoreSize,
+ AATags.TBAA, AATags.TBAAStruct, AATags.Scope, AATags.NoAlias);
}
NewCall->setDebugLoc(TheStore->getDebugLoc());
@@ -2113,7 +2024,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
auto *LbBr = cast<BranchInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
- PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi", &Body->front());
+ PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
+ TcPhi->insertBefore(Body->begin());
Builder.SetInsertPoint(LbCond);
Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
@@ -2219,7 +2131,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
- PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi");
+ TcPhi->insertBefore(Body->begin());
Builder.SetInsertPoint(LbCond);
Instruction *TcDec = cast<Instruction>(
@@ -2477,7 +2390,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// intrinsic/shift we'll use are not cheap. Note that we are okay with *just*
// making the loop countable, even if nothing else changes.
IntrinsicCostAttributes Attrs(
- IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getTrue()});
+ IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getTrue()});
InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
if (Cost > TargetTransformInfo::TCC_Basic) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
@@ -2493,6 +2406,24 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// Ok, transform appears worthwhile.
MadeChange = true;
+ if (!isGuaranteedNotToBeUndefOrPoison(BitPos)) {
+ // BitMask may be computed from BitPos, Freeze BitPos so we can increase
+ // it's use count.
+ Instruction *InsertPt = nullptr;
+ if (auto *BitPosI = dyn_cast<Instruction>(BitPos))
+ InsertPt = &**BitPosI->getInsertionPointAfterDef();
+ else
+ InsertPt = &*DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
+ if (!InsertPt)
+ return false;
+ FreezeInst *BitPosFrozen =
+ new FreezeInst(BitPos, BitPos->getName() + ".fr", InsertPt);
+ BitPos->replaceUsesWithIf(BitPosFrozen, [BitPosFrozen](Use &U) {
+ return U.getUser() != BitPosFrozen;
+ });
+ BitPos = BitPosFrozen;
+ }
+
// Step 1: Compute the loop trip count.
Value *LowBitMask = Builder.CreateAdd(BitMask, Constant::getAllOnesValue(Ty),
@@ -2501,7 +2432,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
Builder.CreateOr(LowBitMask, BitMask, BitPos->getName() + ".mask");
Value *XMasked = Builder.CreateAnd(X, Mask, X->getName() + ".masked");
CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
- IntrID, Ty, {XMasked, /*is_zero_undef=*/Builder.getTrue()},
+ IntrID, Ty, {XMasked, /*is_zero_poison=*/Builder.getTrue()},
/*FMFSource=*/nullptr, XMasked->getName() + ".numleadingzeros");
Value *XMaskedNumActiveBits = Builder.CreateSub(
ConstantInt::get(Ty, Ty->getScalarSizeInBits()), XMaskedNumLeadingZeros,
@@ -2562,7 +2493,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
// Step 4: Rewrite the loop into a countable form, with canonical IV.
// The new canonical induction variable.
- Builder.SetInsertPoint(&LoopHeaderBB->front());
+ Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
auto *IV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
// The induction itself.
@@ -2831,7 +2762,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// intrinsic we'll use are not cheap. Note that we are okay with *just*
// making the loop countable, even if nothing else changes.
IntrinsicCostAttributes Attrs(
- IntrID, Ty, {UndefValue::get(Ty), /*is_zero_undef=*/Builder.getFalse()});
+ IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
if (Cost > TargetTransformInfo::TCC_Basic) {
LLVM_DEBUG(dbgs() << DEBUG_TYPE
@@ -2849,7 +2780,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// Step 1: Compute the loop's final IV value / trip count.
CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
- IntrID, Ty, {Val, /*is_zero_undef=*/Builder.getFalse()},
+ IntrID, Ty, {Val, /*is_zero_poison=*/Builder.getFalse()},
/*FMFSource=*/nullptr, Val->getName() + ".numleadingzeros");
Value *ValNumActiveBits = Builder.CreateSub(
ConstantInt::get(Ty, Ty->getScalarSizeInBits()), ValNumLeadingZeros,
@@ -2886,11 +2817,11 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
// Step 3: Rewrite the loop into a countable form, with canonical IV.
// The new canonical induction variable.
- Builder.SetInsertPoint(&LoopHeaderBB->front());
+ Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->begin());
auto *CIV = Builder.CreatePHI(Ty, 2, CurLoop->getName() + ".iv");
// The induction itself.
- Builder.SetInsertPoint(LoopHeaderBB->getFirstNonPHI());
+ Builder.SetInsertPoint(LoopHeaderBB, LoopHeaderBB->getFirstNonPHIIt());
auto *CIVNext =
Builder.CreateAdd(CIV, ConstantInt::get(Ty, 1), CIV->getName() + ".next",
/*HasNUW=*/true, /*HasNSW=*/Bitwidth != 2);