aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp')
-rw-r--r--contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp309
1 files changed, 190 insertions, 119 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 66bc46aaefea..19a83ad53e2e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -12,6 +12,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
const MachineRegisterInfo *MRI = nullptr;
+ MCRegister Exec;
+
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
@@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
unsigned MaxInstructions = 20) const;
- MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
- MCRegister Exec) const;
- bool optimizeExecSequence() const;
- bool optimizeVCmpxAndSaveexecSequence() const;
- bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp,
- MCRegister Exec) const;
+ bool optimizeExecSequence();
+ void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
+ bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec) const;
+
+ void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
+ bool optimizeOrSaveexecXorSequences();
public:
static char ID;
@@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == TRI->getExec())
+ if (Src.isReg() && Src.getReg() == Exec)
return MI.getOperand(0).getReg();
}
}
@@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
- MI.getOperand(1).isReg())
+ if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
// =>
// x = s_<op>_saveexec_b64 y
//
-bool SIOptimizeExecMasking::optimizeExecSequence() const {
- MCRegister Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::optimizeExecSequence() {
bool Changed = false;
for (MachineBasicBlock &MBB : *MF) {
MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
@@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const {
return Changed;
}
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec) const {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(*MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()});
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // it's MBB Live-outs, meaning it's used in any of it's successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
- // s_and_saveexec, skip the optimization.
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
- true) ||
- isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs))
- return nullptr;
-
- return VCmp;
-}
-
// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
@@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());
+ SaveExecInstr.eraseFromParent();
+ VCmp.eraseFromParent();
+
return true;
}
-// After all s_op_saveexec instructions are inserted,
-// replace (on GFX10.3 and later)
+// Record (on GFX10.3 and later) occurences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-// with
+// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
-bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
+ MachineInstr &MI) {
if (!ST->hasGFX10_3Insts())
- return false;
+ return;
- bool Changed = false;
-
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- MCRegister Exec = TRI->getExec();
const unsigned AndSaveExecOpcode =
ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
- for (MachineBasicBlock &MBB : *MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ return;
+
+ Register SaveExecDest = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+ return;
- if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
- SaveExecVCmpMapping[&MI] = VCmp;
+ MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return;
+
+ // Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec
+ // sequence by looking at an instance of a s_and_saveexec instruction. Returns
+ // a pointer to the v_cmp instruction if it is safe to replace the sequence
+ // (see the conditions in the function body). This is after register
+ // allocation, so some checks on operand dependencies need to be considered.
+ MachineInstr *VCmp = nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+ // operand without any write to Exec or the saveexec input operand inbetween.
+ VCmp = findInstrBackwards(
+ MI,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()});
+
+ if (!VCmp)
+ return;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+ MI.modifiesRegister(Src0->getReg(), TRI))
+ return;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+ MI.modifiesRegister(Src1->getReg(), TRI))
+ return;
+
+ // Don't do the transformation if the destination operand is included in
+ // it's MBB Live-outs, meaning it's used in any of it's successors, leading
+ // to incorrect code if the v_cmp and therefore the def of
+ // the dest operand is removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
+ isRegisterInUseAfter(MI, VCmpDest->getReg()))
+ return;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+ return;
+
+ if (VCmp)
+ SaveExecVCmpMapping[&MI] = VCmp;
+}
+
+// Record occurences of
+// s_or_saveexec s_o, s_i
+// s_xor exec, exec, s_o
+// to be replaced with
+// s_andn2_saveexec s_o, s_i.
+void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
+ const unsigned XorOpcode =
+ ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+
+ if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+ const MachineOperand &XorDst = MI.getOperand(0);
+ const MachineOperand &XorSrc0 = MI.getOperand(1);
+ const MachineOperand &XorSrc1 = MI.getOperand(2);
+
+ if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+ XorSrc1.isReg() &&
+ (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
+ const unsigned OrSaveexecOpcode = ST->isWave32()
+ ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64;
+
+ // Peek at the previous instruction and check if this is a relevant
+ // s_or_saveexec instruction.
+ MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
+ if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+ return;
+
+ const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
+ const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
+ if (OrDst.isReg() && OrSrc0.isReg()) {
+ if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
+ (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+ OrXors.emplace_back(&PossibleOrSaveexec, &MI);
+ }
+ }
}
}
+}
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
+bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
+ if (OrXors.empty()) {
+ return false;
+ }
- if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
+ bool Changed = false;
+ const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
+ : AMDGPU::S_ANDN2_SAVEEXEC_B64;
- Changed = true;
- }
+ for (const auto &Pair : OrXors) {
+ MachineInstr *Or = nullptr;
+ MachineInstr *Xor = nullptr;
+ std::tie(Or, Xor) = Pair;
+ BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
+ TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+ .addReg(Or->getOperand(1).getReg());
+
+ Or->eraseFromParent();
+ Xor->eraseFromParent();
+
+ Changed = true;
}
return Changed;
@@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
TRI = ST->getRegisterInfo();
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
+ Exec = TRI->getExec();
bool Changed = optimizeExecSequence();
- Changed |= optimizeVCmpxAndSaveexecSequence();
+
+ OrXors.clear();
+ SaveExecVCmpMapping.clear();
+ static unsigned SearchWindow = 10;
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned SearchCount = 0;
+
+ for (auto &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugInstr())
+ continue;
+
+ if (SearchCount >= SearchWindow) {
+ break;
+ }
+
+ tryRecordOrSaveexecXorSequence(MI);
+ tryRecordVCmpxAndSaveexecSequence(MI);
+
+ if (MI.modifiesRegister(Exec, TRI)) {
+ break;
+ }
+
+ ++SearchCount;
+ }
+ }
+
+ Changed |= optimizeOrSaveexecXorSequences();
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+ }
return Changed;
}