diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp | 505 |
1 files changed, 160 insertions, 345 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp index 5bf3522ab2e6..cccac5595288 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -22,23 +22,13 @@ /// The HardwareLoops pass inserts intrinsics identifying loops that the /// backend will attempt to convert into a low-overhead loop. The vectorizer is /// responsible for generating a vectorized loop in which the lanes are -/// predicated upon the iteration counter. This pass looks at these predicated -/// vector loops, that are targets for low-overhead loops, and prepares it for -/// code generation. Once the vectorizer has produced a masked loop, there's a -/// couple of final forms: -/// - A tail-predicated loop, with implicit predication. -/// - A loop containing multiple VCPT instructions, predicating multiple VPT -/// blocks of instructions operating on different vector types. -/// -/// This pass: -/// 1) Checks if the predicates of the masked load/store instructions are -/// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes -/// the Backedge Taken Count (BTC) of the scalar loop as its second argument, -/// which we extract to set up the number of elements processed by the loop. -/// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target -/// specific VCTP intrinsic to represent the effect of tail predication. -/// This will be picked up by the ARM Low-overhead loop pass, which performs -/// the final transformation to a DLSTP or WLSTP tail-predicated loop. +/// predicated upon an get.active.lane.mask intrinsic. This pass looks at these +/// get.active.lane.mask intrinsic and attempts to convert them to VCTP +/// instructions. This will be picked up by the ARM Low-overhead loop pass later +/// in the backend, which performs the final transformation to a DLSTP or WLSTP +/// tail-predicated loop. +// +//===----------------------------------------------------------------------===// #include "ARM.h" #include "ARMSubtarget.h" @@ -57,6 +47,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" @@ -66,8 +57,8 @@ using namespace llvm; #define DESC "Transform predicated vector loops to use MVE tail predication" cl::opt<TailPredication::Mode> EnableTailPredication( - "tail-predication", cl::desc("MVE tail-predication options"), - cl::init(TailPredication::Disabled), + "tail-predication", cl::desc("MVE tail-predication pass options"), + cl::init(TailPredication::Enabled), cl::values(clEnumValN(TailPredication::Disabled, "disabled", "Don't tail-predicate loops"), clEnumValN(TailPredication::EnabledNoReductions, @@ -112,23 +103,18 @@ public: bool runOnLoop(Loop *L, LPPassManager&) override; private: - /// Perform the relevant checks on the loop and convert if possible. - bool TryConvert(Value *TripCount); + /// Perform the relevant checks on the loop and convert active lane masks if + /// possible. + bool TryConvertActiveLaneMask(Value *TripCount); - /// Return whether this is a vectorized loop, that contains masked - /// load/stores. - bool IsPredicatedVectorLoop(); - - /// Perform checks on the arguments of @llvm.get.active.lane.mask - /// intrinsic: check if the first is a loop induction variable, and for the - /// the second check that no overflow can occur in the expression that use - /// this backedge-taken count. - bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy); + /// Perform several checks on the arguments of @llvm.get.active.lane.mask + /// intrinsic. E.g., check that the loop induction variable and the element + /// count are of the form we expect, and also perform overflow checks for + /// the new expressions that are created. + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Insert the intrinsic to represent the effect of tail predication. - void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy); + void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount); /// Rematerialize the iteration count in exit blocks, which enables /// ARMLowOverheadLoops to better optimise away loop update statements inside @@ -138,25 +124,6 @@ private: } // end namespace -static bool IsDecrement(Instruction &I) { - auto *Call = dyn_cast<IntrinsicInst>(&I); - if (!Call) - return false; - - Intrinsic::ID ID = Call->getIntrinsicID(); - return ID == Intrinsic::loop_decrement_reg; -} - -static bool IsMasked(Instruction *I) { - auto *Call = dyn_cast<IntrinsicInst>(I); - if (!Call) - return false; - - Intrinsic::ID ID = Call->getIntrinsicID(); - // TODO: Support gather/scatter expand/compress operations. - return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load; -} - bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || !EnableTailPredication) return false; @@ -188,7 +155,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { continue; Intrinsic::ID ID = Call->getIntrinsicID(); - if (ID == Intrinsic::set_loop_iterations || + if (ID == Intrinsic::start_loop_iterations || ID == Intrinsic::test_set_loop_iterations) return cast<IntrinsicInst>(&I); } @@ -207,148 +174,23 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { return false; } - // Search for the hardware loop intrinic that decrements the loop counter. - IntrinsicInst *Decrement = nullptr; - for (auto *BB : L->getBlocks()) { - for (auto &I : *BB) { - if (IsDecrement(I)) { - Decrement = cast<IntrinsicInst>(&I); - break; - } - } - } - - if (!Decrement) - return false; - - LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" - << *Decrement << "\n"); - - if (!TryConvert(Setup->getArgOperand(0))) { - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; - } - - return true; -} - -static FixedVectorType *getVectorType(IntrinsicInst *I) { - unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1; - auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType()); - auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType()); - assert(VecTy && "No scalable vectors expected here"); - return VecTy; -} + LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"); -bool MVETailPredication::IsPredicatedVectorLoop() { - // Check that the loop contains at least one masked load/store intrinsic. - // We only support 'normal' vector instructions - other than masked - // load/stores. - bool ActiveLaneMask = false; - for (auto *BB : L->getBlocks()) { - for (auto &I : *BB) { - auto *Int = dyn_cast<IntrinsicInst>(&I); - if (!Int) - continue; + bool Changed = TryConvertActiveLaneMask(Setup->getArgOperand(0)); - switch (Int->getIntrinsicID()) { - case Intrinsic::get_active_lane_mask: - ActiveLaneMask = true; - LLVM_FALLTHROUGH; - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - continue; - case Intrinsic::fma: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::round: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::fabs: - if (ST->hasMVEFloatOps()) - continue; - LLVM_FALLTHROUGH; - default: - break; - } - - if (IsMasked(&I)) { - auto *VecTy = getVectorType(Int); - unsigned Lanes = VecTy->getNumElements(); - unsigned ElementWidth = VecTy->getScalarSizeInBits(); - // MVE vectors are 128-bit, but don't support 128 x i1. - // TODO: Can we support vectors larger than 128-bits? - unsigned MaxWidth = TTI->getRegisterBitWidth(true); - if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) - return false; - MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - continue; - } - - for (const Use &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } - } - } - - if (!ActiveLaneMask) { - LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); - return false; - } - return !MaskedInsts.empty(); -} - -// Look through the exit block to see whether there's a duplicate predicate -// instruction. This can happen when we need to perform a select on values -// from the last and previous iteration. Instead of doing a straight -// replacement of that predicate with the vctp, clone the vctp and place it -// in the block. This means that the VPR doesn't have to be live into the -// exit block which should make it easier to convert this loop into a proper -// tail predicated loop. -static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) { - BasicBlock *Exit = L->getUniqueExitBlock(); - if (!Exit) { - LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n"); - return; - } - - // Drop references and add operands to check for dead. - SmallPtrSet<Instruction*, 4> Dead; - while (!MaybeDead.empty()) { - auto *I = MaybeDead.front(); - MaybeDead.remove(I); - if (I->hasNUsesOrMore(1)) - continue; - - for (auto &U : I->operands()) - if (auto *OpI = dyn_cast<Instruction>(U)) - MaybeDead.insert(OpI); - - Dead.insert(I); - } - - for (auto *I : Dead) { - LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump()); - I->eraseFromParent(); - } - - for (auto I : L->blocks()) - DeleteDeadPHIs(I); + return Changed; } // The active lane intrinsic has this form: // -// @llvm.get.active.lane.mask(IV, BTC) +// @llvm.get.active.lane.mask(IV, TC) // // Here we perform checks that this intrinsic behaves as expected, // which means: // -// 1) The element count, which is calculated with BTC + 1, cannot overflow. -// 2) The element count needs to be sufficiently large that the decrement of -// element counter doesn't overflow, which means that we need to prove: +// 1) Check that the TripCount (TC) belongs to this loop (originally). +// 2) The element count (TC) needs to be sufficiently large that the decrement +// of element counter doesn't overflow, which means that we need to prove: // ceil(ElementCount / VectorWidth) >= TripCount // by rounding up ElementCount up: // ((ElementCount + (VectorWidth - 1)) / VectorWidth @@ -357,109 +199,122 @@ static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) { // 3) The IV must be an induction phi with an increment equal to the // vector width. bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, - Value *TripCount, FixedVectorType *VecTy) { + Value *TripCount) { bool ForceTailPredication = EnableTailPredication == TailPredication::ForceEnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabled; - // 1) Test whether entry to the loop is protected by a conditional - // BTC + 1 < 0. In other words, if the scalar trip count overflows, - // becomes negative, we shouldn't enter the loop and creating - // tripcount expression BTC + 1 is not safe. So, check that BTC - // isn't max. This is evaluated in unsigned, because the semantics - // of @get.active.lane.mask is a ULE comparison. - - int VectorWidth = VecTy->getNumElements(); - auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); - auto *BTC = SE->getSCEV(BackedgeTakenCount); - - if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) && - !ForceTailPredication) { - LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: "; - BTC->dump()); + + Value *ElemCount = ActiveLaneMask->getOperand(1); + bool Changed = false; + if (!L->makeLoopInvariant(ElemCount, Changed)) return false; - } - // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow: - // - // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount - // - // 2.1) First prove overflow can't happen in: - // - // ElementCount + (VectorWidth - 1) - // - // Because of a lack of context, it is difficult to get a useful bounds on - // this expression. But since ElementCount uses the same variables as the - // TripCount (TC), for which we can find meaningful value ranges, we use that - // instead and assert that: - // - // upperbound(TC) <= UINT_MAX - VectorWidth - // + auto *EC= SE->getSCEV(ElemCount); auto *TC = SE->getSCEV(TripCount); - unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits(); - auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth); - uint64_t MaxMinusVW = Diff.getZExtValue(); - uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue(); - - if (UpperboundTC > MaxMinusVW && !ForceTailPredication) { - LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n"; - dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n"; - dbgs() << UpperboundTC << " <= " << MaxMinusVW << "== false\n";); + int VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); + if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) return false; - } + ConstantInt *ConstElemCount = nullptr; - // 2.2) Make sure overflow doesn't happen in final expression: - // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount, - // To do this, compare the full ranges of these subexpressions: - // - // Range(Ceil) <= Range(TC) - // - // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime - // values (and not constants), we have to compensate for the lowerbound value - // range to be off by 1. The reason is that BTC lives in the preheader in - // this form: - // - // %trip.count.minus = add nsw nuw i32 %N, -1 - // - // For the loop to be executed, %N has to be >= 1 and as a result the value - // range of %trip.count.minus has a lower bound of 0. Value %TC has this form: - // - // %5 = add nuw nsw i32 %4, 1 - // call void @llvm.set.loop.iterations.i32(i32 %5) - // - // where %5 is some expression using %N, which needs to have a lower bound of - // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set, - // we first add 0 to TC such that we can do the <= comparison on both sets. - // - auto *One = SE->getOne(TripCount->getType()); - // ElementCount = BTC + 1 - auto *ElementCount = SE->getAddExpr(BTC, One); - // Tmp = ElementCount + (VW-1) - auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount, - SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); - // Ceil = ElementCount + (VW-1) / VW - auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, - SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth))); - - ConstantRange RangeCeil = SE->getSignedRange(Ceil) ; - ConstantRange RangeTC = SE->getSignedRange(TC) ; - if (!RangeTC.isSingleElement()) { - auto ZeroRange = - ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0)); - RangeTC = RangeTC.unionWith(ZeroRange); - } - if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) { - LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n"); + // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to + // this loop. The scalar tripcount corresponds the number of elements + // processed by the loop, so we will refer to that from this point on. + if (!SE->isLoopInvariant(EC, L)) { + LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n"); return false; } - // 3) Find out if IV is an induction phi. Note that We can't use Loop + if ((ConstElemCount = dyn_cast<ConstantInt>(ElemCount))) { + ConstantInt *TC = dyn_cast<ConstantInt>(TripCount); + if (!TC) { + LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in " + "set.loop.iterations\n"); + return false; + } + + // Calculate 2 tripcount values and check that they are consistent with + // each other. The TripCount for a predicated vector loop body is + // ceil(ElementCount/Width), or floor((ElementCount+Width-1)/Width) as we + // work it out here. + uint64_t TC1 = TC->getZExtValue(); + uint64_t TC2 = + (ConstElemCount->getZExtValue() + VectorWidth - 1) / VectorWidth; + + // If the tripcount values are inconsistent, we can't insert the VCTP and + // trigger tail-predication; keep the intrinsic as a get.active.lane.mask + // and legalize this. + if (TC1 != TC2) { + LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: " + << TC1 << " from set.loop.iterations, and " + << TC2 << " from get.active.lane.mask\n"); + return false; + } + } else if (!ForceTailPredication) { + // 2) We need to prove that the sub expression that we create in the + // tail-predicated loop body, which calculates the remaining elements to be + // processed, is non-negative, i.e. it doesn't overflow: + // + // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0 + // + // This is true if: + // + // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth + // + // which what we will be using here. + // + auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)); + // ElementCount + (VW-1): + auto *ECPlusVWMinus1 = SE->getAddExpr(EC, + SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1))); + + // Ceil = ElementCount + (VW-1) / VW + auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW); + + // Prevent unused variable warnings with TC + (void)TC; + LLVM_DEBUG( + dbgs() << "ARM TP: Analysing overflow behaviour for:\n"; + dbgs() << "ARM TP: - TripCount = "; TC->dump(); + dbgs() << "ARM TP: - ElemCount = "; EC->dump(); + dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n"; + dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump(); + ); + + // As an example, almost all the tripcount expressions (produced by the + // vectoriser) look like this: + // + // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4) + // + // and "ElementCount + (VW-1) / VW": + // + // Ceil = ((3 + %N) /u 4) + // + // Check for equality of TC and Ceil by calculating SCEV expression + // TC - Ceil and test it for zero. + // + bool Zero = SE->getMinusSCEV( + SE->getBackedgeTakenCount(L), + SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW), + SE->getNegativeSCEV(VW)), + VW)) + ->isZero(); + + if (!Zero) { + LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n"); + return false; + } + } + + // 3) Find out if IV is an induction phi. Note that we can't use Loop // helpers here to get the induction variable, because the hardware loop is - // no longer in loopsimplify form, and also the hwloop intrinsic use a - // different counter. Using SCEV, we check that the induction is of the + // no longer in loopsimplify form, and also the hwloop intrinsic uses a + // different counter. Using SCEV, we check that the induction is of the // form i = i + 4, where the increment must be equal to the VectorWidth. auto *IV = ActiveLaneMask->getOperand(0); auto *IVExpr = SE->getSCEV(IV); auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr); + if (!AddExpr) { LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump()); return false; @@ -469,6 +324,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n"); return false; } + auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0)); + if (!Base || !Base->isZero()) { + LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n"); + return false; + } auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1)); if (!Step) { LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: "; @@ -479,68 +339,29 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match " - "vector width " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue + << " doesn't match vector width " << VectorWidth << "\n"); return false; } -// Materialize NumElements in the preheader block. -static Value *getNumElements(BasicBlock *Preheader, Value *BTC) { - // First, check the preheader if it not already exist: - // - // preheader: - // %BTC = add i32 %N, -1 - // .. - // vector.body: - // - // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1, - // but instead can just return %N. - for (auto &I : *Preheader) { - if (I.getOpcode() != Instruction::Add || &I != BTC) - continue; - ConstantInt *MinusOne = nullptr; - if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1)))) - continue; - if (MinusOne->getSExtValue() == -1) { - LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n"); - return I.getOperand(0); - } - } - - // But we do need to materialise BTC if it is not already there, - // e.g. if it is a constant. - IRBuilder<> Builder(Preheader->getTerminator()); - Value *NumElements = Builder.CreateAdd(BTC, - ConstantInt::get(BTC->getType(), 1), "num.elements"); - LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n"); - return NumElements; -} - void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, - Value *TripCount, FixedVectorType *VecTy) { + Value *TripCount) { IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); - unsigned VectorWidth = VecTy->getNumElements(); - - // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, - // is one less than the trip count. So we need to find or create - // %num.elements = %BTC + 1 in the preheader. - Value *BTC = ActiveLaneMask->getOperand(1); - Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator()); - Value *NumElements = getNumElements(L->getLoopPreheader(), BTC); + unsigned VectorWidth = + cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); // Insert a phi to count the number of elements processed by the loop. - Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() ); + Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI()); PHINode *Processed = Builder.CreatePHI(Ty, 2); - Processed->addIncoming(NumElements, L->getLoopPreheader()); + Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader()); - // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus - // represent the effect of tail predication. + // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and + // thus represent the effect of tail predication. Builder.SetInsertPoint(ActiveLaneMask); - ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); + ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; switch (VectorWidth) { @@ -569,42 +390,36 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n"); } -bool MVETailPredication::TryConvert(Value *TripCount) { - if (!IsPredicatedVectorLoop()) { - LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n"); +bool MVETailPredication::TryConvertActiveLaneMask(Value *TripCount) { + SmallVector<IntrinsicInst *, 4> ActiveLaneMasks; + for (auto *BB : L->getBlocks()) + for (auto &I : *BB) + if (auto *Int = dyn_cast<IntrinsicInst>(&I)) + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) + ActiveLaneMasks.push_back(Int); + + if (ActiveLaneMasks.empty()) return false; - } LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n"); - SetVector<Instruction*> Predicates; - - // Walk through the masked intrinsics and try to find whether the predicate - // operand is generated by intrinsic @llvm.get.active.lane.mask(). - for (auto *I : MaskedInsts) { - unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3; - auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp)); - if (!Predicate || Predicates.count(Predicate)) - continue; - - auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); - if (!ActiveLaneMask || - ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) - continue; - - Predicates.insert(Predicate); + + for (auto *ActiveLaneMask : ActiveLaneMasks) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - auto *VecTy = getVectorType(I); - if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { + if (!IsSafeActiveMask(ActiveLaneMask, TripCount)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n"); - InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy); + InsertVCTPIntrinsic(ActiveLaneMask, TripCount); } - Cleanup(Predicates, L); + // Remove dead instructions and now dead phis. + for (auto *II : ActiveLaneMasks) + RecursivelyDeleteTriviallyDeadInstructions(II); + for (auto I : L->blocks()) + DeleteDeadPHIs(I); return true; } |