Diffstat (limited to 'llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 212
1 file changed, 212 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
new file mode 100644
index 000000000000..a5008e39d91a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -0,0 +1,212 @@
+//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back. It also contains
+/// subroutines useful in the creation of VOPD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNVOPDUtils.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-vopd-utils"
+
+bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI) {
+ const MachineFunction *MF = FirstMI.getMF();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
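+  // VOPD source reads go through four VGPR banks, selected by the register
+  // number modulo 4; the bank-conflict checks below reject pairs whose
+  // sources collide on the same bank.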
+ const unsigned NumVGPRBanks = 4;
+  // Literals also count against the scalar bus limit.
+ SmallVector<const MachineOperand *> UniqueLiterals;
+ auto addLiteral = [&](const MachineOperand &Op) {
+ for (auto &Literal : UniqueLiterals) {
+ if (Literal->isIdenticalTo(Op))
+ return;
+ }
+ UniqueLiterals.push_back(&Op);
+ };
+ SmallVector<Register> UniqueScalarRegs;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
+ MII != FirstMI.getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+ // Cannot pair dependent instructions
+ for (const auto &Use : SecondMI.uses())
+ if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+ return false;
+
+ struct ComponentInfo {
+ ComponentInfo(const MachineInstr &MI) : MI(MI) {}
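+    // Destination plus the source register seen on each VOPD port
+    // (Reg0 = src0, Reg1 = src1, Reg2 = src2); a default (null) Register
+    // means the port is unused by this component.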
+ Register Dst, Reg0, Reg1, Reg2;
+ const MachineInstr &MI;
+ };
+ ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};
+
+ for (ComponentInfo &Comp : CInfo) {
+ switch (Comp.MI.getOpcode()) {
+ case AMDGPU::V_FMAMK_F32:
+ // cannot inline the fixed literal in fmamk
+ addLiteral(Comp.MI.getOperand(2));
+ Comp.Reg2 = Comp.MI.getOperand(3).getReg();
+ break;
+ case AMDGPU::V_FMAAK_F32:
+ // cannot inline the fixed literal in fmaak
+ addLiteral(Comp.MI.getOperand(3));
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_FMAC_F32_e32:
+ case AMDGPU::V_DOT2_F32_F16:
+ case AMDGPU::V_DOT2_F32_BF16:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ Comp.Reg2 = Comp.MI.getOperand(0).getReg();
+ break;
+ case AMDGPU::V_CNDMASK_B32_e32:
+ UniqueScalarRegs.push_back(AMDGPU::VCC_LO);
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ case AMDGPU::V_MOV_B32_e32:
+ break;
+ default:
+ Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+ break;
+ }
+
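+    // Every candidate opcode defines its result in operand 0.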
+ Comp.Dst = Comp.MI.getOperand(0).getReg();
+
+ const MachineOperand &Op0 = Comp.MI.getOperand(1);
+ if (Op0.isReg()) {
+ if (!TRI->isVectorRegister(MRI, Op0.getReg())) {
+ if (!is_contained(UniqueScalarRegs, Op0.getReg()))
+ UniqueScalarRegs.push_back(Op0.getReg());
+ } else
+ Comp.Reg0 = Op0.getReg();
+ } else {
+ if (!TII.isInlineConstant(Comp.MI, 1))
+ addLiteral(Op0);
+ }
+ }
+
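+  // At most one unique literal is allowed, and literals plus unique scalar
+  // registers together may not exceed two scalar-bus reads.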
+ if (UniqueLiterals.size() > 1)
+ return false;
+ if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
+ return false;
+
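+  // Example (a sketch with hypothetical registers): pairing
+  //   v_mov_b32 v0, v2  with  v_add_f32 v1, v3, v4
+  // passes all checks below: the src0 banks of v2 and v3 differ, only one
+  // component uses port 1, and the dsts v0 and v1 differ in parity.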
+  // Check port 0: the src0 registers must come from different VGPR banks.
+ if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
+ CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
+ return false;
+  // Check port 1: the src1 registers must come from different VGPR banks.
+ if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
+ CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
+ return false;
+  // Check port 2: the src2 registers must differ in parity (even vs. odd).
+ if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
+ !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
+ return false;
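+  // The two destination VGPRs must also differ in parity.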
+ if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
+ << "\n\tY: " << SecondMI << "\n");
+ return true;
+}
+
+/// Check if the instruction pair, FirstMI and SecondMI, should be scheduled
+/// together. When FirstMI is unspecified, check whether SecondMI may be part
+/// of a fused pair at all.
+static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
+ unsigned Opc2 = SecondMI.getOpcode();
+ auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+
+ // One instruction case
+ if (!FirstMI)
+ return SecondCanBeVOPD.Y;
+
+ unsigned Opc = FirstMI->getOpcode();
+ auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+
+ if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
+ (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
+ return false;
+
+ return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
+}
+
+/// Adapts the design from MacroFusion: puts valid candidate instructions
+/// back to back so they can easily be turned into VOPD instructions.
+/// Greedily pairs instruction candidates; O(n^2) in the number of SUnits.
+struct VOPDPairingMutation : ScheduleDAGMutation {
+ ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer
+
+ VOPDPairingMutation(
+ ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
+ : shouldScheduleAdjacent(shouldScheduleAdjacent) {}
+
+ void apply(ScheduleDAGInstrs *DAG) override {
+ const TargetInstrInfo &TII = *DAG->TII;
+ const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+ if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
+ LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
+ return;
+ }
+
+    for (auto ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
+ const MachineInstr *IMI = ISUI->getInstr();
+ if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI))
+ continue;
+ if (!hasLessThanNumFused(*ISUI, 2))
+ continue;
+
+      for (auto JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
+ if (JSUI->isBoundaryNode())
+ continue;
+ const MachineInstr *JMI = JSUI->getInstr();
+ if (!hasLessThanNumFused(*JSUI, 2) ||
+ !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
+ continue;
+ if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
+ break;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
+ }
+};
+
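+// Usage sketch (assumed call site, not part of this file): a machine
+// scheduler can install the mutation with
+//   DAG->addMutation(createVOPDPairingMutation());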
+std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
+ return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
+}