Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 164
1 file changed, 89 insertions, 75 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 1ddf6686b97e..bbd262748d68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -16,7 +16,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUIGroupLP.h"
-#include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
@@ -191,7 +190,7 @@ public:
   bool allowedByRules(const SUnit *SU,
                       SmallVectorImpl<SchedGroup> &SyncPipe) const {
     for (auto &Rule : Rules) {
-      if (!Rule.get()->apply(SU, Collection, SyncPipe))
+      if (!Rule->apply(SU, Collection, SyncPipe))
         return false;
     }
     return true;
@@ -240,23 +239,6 @@ public:
   }
 };
 
-// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
-static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
-  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
-         SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
-
-  while (!SU.Preds.empty())
-    for (auto &P : SU.Preds)
-      SU.removePred(P);
-
-  while (!SU.Succs.empty())
-    for (auto &S : SU.Succs)
-      for (auto &SP : S.getSUnit()->Preds)
-        if (SP.getSUnit() == &SU)
-          S.getSUnit()->removePred(SP);
-}
-
 using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
 using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
@@ -270,7 +252,7 @@
 // only be used for small sized problems or medium sized problems where an exact
 // solution is highly desired.
 class PipelineSolver {
-  ScheduleDAGMI *DAG;
+  [[maybe_unused]] ScheduleDAGMI *DAG;
 
   // Instructions that can be assigned to multiple SchedGroups
   DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
@@ -394,7 +376,7 @@ void PipelineSolver::reset() {
   for (auto &SG : SyncPipeline) {
     SmallVector<SUnit *, 32> TempCollection = SG.Collection;
     SG.Collection.clear();
-    auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
+    auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
      return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
     });
     if (SchedBarr != TempCollection.end())
@@ -421,7 +403,7 @@ void PipelineSolver::convertSyncMapsToArrays() {
           std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
       continue;
     }
-    auto SortPosition = PipelineInstrs[PipelineIDx].begin();
+    auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
     // Insert them in sorted order -- this allows for good parsing order in
     // the greedy algorithm
     while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
@@ -460,7 +442,6 @@ void PipelineSolver::makePipeline() {
       // Command line requested IGroupLP doesn't have SGBarr
       if (!SGBarr)
         continue;
-      resetEdges(*SGBarr, DAG);
       SG.link(*SGBarr, false);
     }
   }
@@ -515,7 +496,7 @@ void PipelineSolver::removeEdges(
     SUnit *Pred = PredSuccPair.first;
     SUnit *Succ = PredSuccPair.second;
 
-    auto Match = llvm::find_if(
+    auto *Match = llvm::find_if(
         Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
     if (Match != Succ->Preds.end()) {
       assert(Match->isArtificial());
@@ -639,8 +620,8 @@ bool PipelineSolver::solveExact() {
       : populateReadyList(ReadyList, CurrSU.second.begin(),
                           CurrSU.second.end());
 
-  auto I = ReadyList.begin();
-  auto E = ReadyList.end();
+  auto *I = ReadyList.begin();
+  auto *E = ReadyList.end();
   for (; I != E; ++I) {
     // If we are trying SGs in least cost order, and the current SG is cost
     // infeasible, then all subsequent SGs will also be cost infeasible, so we
@@ -833,7 +814,8 @@ void PipelineSolver::solve() {
 
 enum IGLPStrategyID : int {
   MFMASmallGemmOptID = 0,
   MFMASmallGemmSingleWaveOptID = 1,
-  MFMAExpInterleave = 2
+  MFMAExpInterleaveID = 2,
+  MFMAExpSimpleInterleaveID = 3
 };
 
 // Implement a IGLP scheduling strategy.
@@ -942,7 +924,7 @@ private:
 
   bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
              SmallVectorImpl<SchedGroup> &SyncPipe) override {
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
 
     if (Cache->empty()) {
       auto I = DAG->SUnits.rbegin();
@@ -955,10 +937,9 @@ private:
         return false;
     }
 
-    auto Reaches = (std::any_of(
-        Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
-          return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
-        }));
+    auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
+      return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+    });
 
     return Reaches;
   }
@@ -977,7 +958,7 @@ private:
              SmallVectorImpl<SchedGroup> &SyncPipe) override {
     bool FoundTrans = false;
     unsigned Counter = 1;
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
 
     if (Cache->empty()) {
       SmallVector<SUnit *, 8> Worklist;
@@ -1017,13 +998,13 @@ public:
 
   bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
              SmallVectorImpl<SchedGroup> &SyncPipe) override {
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
 
     if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
       return false;
 
     if (Cache->empty()) {
-      auto TempSU = ChainSeed;
+      auto *TempSU = ChainSeed;
       auto Depth = Number;
       while (Depth > 0) {
         --Depth;
@@ -1233,7 +1214,7 @@ private:
     if (!OtherGroup->Collection.size())
       return true;
 
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
 
     for (auto &OtherEle : OtherGroup->Collection)
       if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
@@ -1276,7 +1257,7 @@ private:
       return false;
 
     if (Cache->empty()) {
-      auto TempSU = ChainSeed;
+      auto *TempSU = ChainSeed;
       auto Depth = Number;
       while (Depth > 0) {
         --Depth;
@@ -1316,7 +1297,7 @@ private:
              SmallVectorImpl<SchedGroup> &SyncPipe) override {
     SmallVector<SUnit *, 12> Worklist;
 
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
     if (Cache->empty()) {
       for (auto &SU : DAG->SUnits)
         if (TII->isTRANS(SU.getInstr()->getOpcode())) {
@@ -1430,19 +1411,16 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
   if (!(TempExp && TempMFMA))
     return false;
 
-  HasChainBetweenCvt =
-      std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
-                   [&isCvt](SDep &Succ) {
-                     return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
-                   }) == (*TempExp)->Succs.end();
+  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
+    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
+  });
 
   // Count the number of MFMAs that are reached by an EXP
   for (auto &SuccSU : MFMAPipeCands) {
     if (MFMAPipeSUs.size() &&
-        std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
-                     [&SuccSU](SUnit *PotentialMatch) {
-                       return PotentialMatch->NodeNum == SuccSU->NodeNum;
-                     }) != MFMAPipeSUs.end())
+        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
+          return PotentialMatch->NodeNum == SuccSU->NodeNum;
+        }))
       continue;
 
     for (auto &PredSU : ExpPipeCands) {
@@ -1480,10 +1458,9 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
   for (auto &MFMAPipeSU : MFMAPipeSUs) {
     if (is_contained(MFMAChainSeeds, MFMAPipeSU))
       continue;
-    if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
-                     [&TII](SDep &Succ) {
-                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
-                     })) {
+    if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
+          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+        })) {
       MFMAChainSeeds.push_back(MFMAPipeSU);
       ++MFMAChains;
     }
@@ -1514,7 +1491,7 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
     return isBitPack(Opc);
   });
 
-  auto PackPred =
+  auto *PackPred =
       std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                    [&isBitPack](SDep &Pred) {
                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
@@ -1851,6 +1828,48 @@ bool MFMAExpInterleaveOpt::applyIGLPStrategy(
   return true;
 }
 
+class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
+public:
+  bool applyIGLPStrategy(
+      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+      AMDGPU::SchedulingPhase Phase) override;
+
+  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
+                           AMDGPU::SchedulingPhase Phase) override {
+    return true;
+  }
+
+  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+      : IGLPStrategy(DAG, TII) {
+    IsBottomUp = true;
+  }
+};
+
+bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
+    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+    AMDGPU::SchedulingPhase Phase) {
+  // Count the number of MFMA instructions.
+  unsigned MFMACount = 0;
+  for (const MachineInstr &I : *DAG)
+    if (TII->isMFMAorWMMA(I))
+      ++MFMACount;
+
+  const unsigned PipelineSyncID = 0;
+  for (unsigned I = 0; I < MFMACount * 3; ++I) {
+    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+  }
+
+  return true;
+}
+
 class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
 private:
   // Whether the DS_READ is a predecessor of first four MFMA in region
@@ -1873,7 +1892,7 @@ private:
     }
 
     assert(Cache->size());
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
     for (auto &Elt : *Cache) {
       if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
         return true;
@@ -1891,7 +1910,7 @@ private:
 
 public:
   bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
             SmallVectorImpl<SchedGroup> &SyncPipe) override {
-    auto MI = SU->getInstr();
+    auto *MI = SU->getInstr();
     if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
       return false;
@@ -1942,14 +1961,10 @@ private:
       return true;
 
     // Does the previous VALU have this DS_Write as a successor
-    return (std::any_of(OtherGroup->Collection.begin(),
-                        OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
-                          return std::any_of(Elt->Succs.begin(),
-                                             Elt->Succs.end(),
-                                             [&SU](SDep &Succ) {
-                                               return Succ.getSUnit() == SU;
-                                             });
-                        }));
+    return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
+      return any_of(Elt->Succs,
+                    [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
+    });
   }
   IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                     bool NeedsCache = false)
@@ -1961,7 +1976,7 @@ private:
 public:
   bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
             SmallVectorImpl<SchedGroup> &SyncPipe) override {
-    auto MI = SU->getInstr();
+    auto *MI = SU->getInstr();
     if (MI->getOpcode() == TargetOpcode::BUNDLE)
       return false;
     if (!Collection.size())
@@ -2032,7 +2047,7 @@ private:
         return false;
     }
 
-    auto DAG = SyncPipe[0].DAG;
+    auto *DAG = SyncPipe[0].DAG;
     // Does the previous DS_WRITE share a V_PERM predecessor with this
     // VMEM_READ
     return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
@@ -2079,7 +2094,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
          "DSWCounters should be zero in pre-RA scheduling!");
   SmallVector<SUnit *, 6> DSWithPerms;
   for (auto &SU : DAG->SUnits) {
-    auto I = SU.getInstr();
+    auto *I = SU.getInstr();
     if (TII->isMFMAorWMMA(*I))
       ++MFMACount;
     else if (TII->isDS(*I)) {
@@ -2100,8 +2115,8 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
 
   if (IsInitial) {
     DSWWithPermCount = DSWithPerms.size();
-    auto I = DSWithPerms.begin();
-    auto E = DSWithPerms.end();
+    auto *I = DSWithPerms.begin();
+    auto *E = DSWithPerms.end();
 
     // Get the count of DS_WRITES with V_PERM predecessors which
     // have loop carried dependencies (WAR) on the same VMEM_READs.
@@ -2122,7 +2137,7 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
           break;
 
        for (auto &Succ : Pred.getSUnit()->Succs) {
-          auto MI = Succ.getSUnit()->getInstr();
+          auto *MI = Succ.getSUnit()->getInstr();
           if (!TII->isVMEM(*MI) || !MI->mayLoad())
             continue;
 
@@ -2132,13 +2147,13 @@ bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
             continue;
           }
 
-          if (!VMEMLookup.contains(MI)) {
+          auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
+          if (Inserted) {
             MissedAny = true;
-            VMEMLookup[MI] = *I;
             continue;
           }
 
-          Cand = VMEMLookup[MI];
+          Cand = It->second;
           if (llvm::is_contained(Counted, Cand)) {
             MissedAny = true;
             break;
@@ -2318,8 +2333,10 @@ createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
     return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
   case MFMASmallGemmSingleWaveOptID:
     return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
-  case MFMAExpInterleave:
+  case MFMAExpInterleaveID:
     return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
+  case MFMAExpSimpleInterleaveID:
+    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
   }
 
   llvm_unreachable("Unknown IGLPStrategyID");
@@ -2576,7 +2593,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
       initSchedGroupBarrierPipelineStage(R);
       FoundSB = true;
     } else if (Opc == AMDGPU::IGLP_OPT) {
-      resetEdges(*R, DAG);
       if (!FoundSB && !FoundIGLP) {
         FoundIGLP = true;
         ShouldApplyIGLP = initIGLPOpt(*R);
@@ -2598,7 +2614,6 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
   assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
   // Remove all existing edges from the SCHED_BARRIER that were added due to the
   // instruction having side effects.
-  resetEdges(SchedBarrier, DAG);
   LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                     << MI.getOperand(0).getImm() << "\n");
   auto InvertedMask =
@@ -2656,7 +2671,6 @@ void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
     std::vector<SUnit>::reverse_iterator RIter) {
   // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
   // to the instruction having side effects.
-  resetEdges(*RIter, DAG);
   MachineInstr &SGB = *RIter->getInstr();
   assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
   int32_t SGMask = SGB.getOperand(0).getImm();
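
Not part of the commit, but for context: a minimal sketch of how a kernel would opt into the new strategy, assuming the existing __builtin_amdgcn_iglp_opt clang builtin, which emits the IGLP_OPT pseudo that initIGLPOpt() dispatches on. The argument is the IGLPStrategyID value, so 3 selects MFMAExpSimpleInterleaveID here.

    #include <hip/hip_runtime.h>

    // Sketch only: request MFMAExpSimpleInterleaveOpt for this scheduling
    // region. Per the applyIGLPStrategy above, the mutation then builds
    // MFMACount * 3 alternating single-instruction TRANS and MFMA groups,
    // interleaving transcendental ops (e.g. v_exp) with MFMAs bottom-up.
    __global__ void mfma_exp_interleave_kernel() {
      __builtin_amdgcn_iglp_opt(3); // 3 == MFMAExpSimpleInterleaveID
      // ... v_exp / MFMA work to be interleaved would go here ...
    }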
