Diffstat (limited to 'llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 153
1 file changed, 99 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index b39420f3c7db..493c1ad87f93 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -104,9 +104,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned BaseOff;
unsigned DMask;
InstClassEnum InstClass;
- bool GLC;
- bool SLC;
- bool DLC;
+ unsigned CPol = 0;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
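The three separate cache-coherence booleans collapse here into one bit-packed
cpol immediate. A minimal standalone sketch of the packing, assuming the bit
assignments of the AMDGPU::CPol enum introduced alongside this change
(GLC = 1, SLC = 2, DLC = 4); packCPol is a hypothetical name, not in-tree code:

  // Hedged sketch: fold the old glc/slc/dlc booleans into one cache-policy
  // word, mirroring what the new CombineInfo::CPol field stores.
  static unsigned packCPol(bool GLC, bool SLC, bool DLC) {
    unsigned CPol = 0;
    if (GLC) CPol |= 1; // globally coherent
    if (SLC) CPol |= 2; // system-level coherent
    if (DLC) CPol |= 4; // device-level coherent
    return CPol;
  }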
@@ -199,6 +197,7 @@ private:
const CombineInfo &Paired);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
+ const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
SmallVectorImpl<MachineInstr *> &InstsToMove);
@@ -304,6 +303,16 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
return 2;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
return 4;
+ case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ return 1;
+ case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return 2;
default:
return 0;
}
@@ -521,11 +530,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
Offset &= 0xffff;
} else if (InstClass != MIMG) {
- GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
- if (InstClass != S_BUFFER_LOAD_IMM) {
- SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
- }
- DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
}
AddressRegs Regs = getRegs(Opc, TII);
@@ -675,10 +680,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
return false;
// Check other optional immediate operands for equality.
- unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
- AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
- AMDGPU::OpName::da, AMDGPU::OpName::r128,
- AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
+ AMDGPU::OpName::unorm, AMDGPU::OpName::da,
+ AMDGPU::OpName::r128, AMDGPU::OpName::a16};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -725,6 +729,16 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
return NewFormatInfo->Format;
}
+// Return the value in the inclusive range [Lo,Hi] that is aligned to the
+// highest power of two. Note that the result is well defined for all inputs
+// including corner cases like:
+// - if Lo == Hi, return that value
+// - if Lo == 0, return 0 (even though the "- 1" below underflows)
+// - if Lo > Hi, return 0 (as if the range wrapped around)
+static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
+ return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
+}
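The bit trick is dense, so here is a standalone restatement with worked
values. This is a sketch only: it uses C++20 &lt;bit&gt; in place of
llvm::countLeadingZeros and llvm::maskLeadingOnes, and the name mostAligned
is hypothetical. The idea: (Lo - 1) ^ Hi has its highest set bit at the first
position where the range crosses an alignment boundary, and masking Hi to the
bits above that position yields the most aligned value still inside [Lo, Hi].

  #include <bit>
  #include <cassert>
  #include <cstdint>

  // Standalone re-statement of mostAlignedValueInRange, for illustration.
  static uint32_t mostAligned(uint32_t Lo, uint32_t Hi) {
    unsigned Keep = std::countl_zero((Lo - 1) ^ Hi) + 1;
    uint32_t Mask = Keep >= 32 ? ~0u : ~0u << (32 - Keep);
    return Hi & Mask;
  }

  int main() {
    assert(mostAligned(5, 9) == 8);    // 8 is 8-aligned and inside [5,9]
    assert(mostAligned(33, 64) == 64); // a power of two in range wins
    assert(mostAligned(7, 7) == 7);    // Lo == Hi returns that value
    assert(mostAligned(0, 5) == 0);    // the Lo == 0 underflow still yields 0
  }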
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
const GCNSubtarget &STI,
CombineInfo &Paired,
@@ -764,20 +778,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return false;
}
- unsigned EltOffset0 = CI.Offset / CI.EltSize;
- unsigned EltOffset1 = Paired.Offset / CI.EltSize;
+ uint32_t EltOffset0 = CI.Offset / CI.EltSize;
+ uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
CI.UseST64 = false;
CI.BaseOff = 0;
- // Handle DS instructions.
+ // Handle all non-DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
- CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
- (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
+ CI.CPol == Paired.CPol &&
+ (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
}
- // Handle SMEM and VMEM instructions.
// If the offset in elements doesn't fit in 8 bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
@@ -800,22 +813,36 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
}
// Try to shift base address to decrease offsets.
- unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
- CI.BaseOff = std::min(CI.Offset, Paired.Offset);
+ uint32_t Min = std::min(EltOffset0, EltOffset1);
+ uint32_t Max = std::max(EltOffset0, EltOffset1);
- if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
+ const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
+ if (((Max - Min) & ~Mask) == 0) {
if (Modify) {
- CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
+ // Copy the low bits of the offsets, so that when we adjust them by
+ // subtracting BaseOff they will be multiples of 64.
+ BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = (EltOffset0 - BaseOff) / 64;
+ Paired.Offset = (EltOffset1 - BaseOff) / 64;
CI.UseST64 = true;
}
return true;
}
- if (isUInt<8>(OffsetDiff)) {
+ if (isUInt<8>(Max - Min)) {
if (Modify) {
- CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
- Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ // From the range of values we could use for BaseOff, choose the one that
+ // is aligned to the highest power of two, to maximise the chance that
+ // the same offset can be reused for other load/store pairs.
+ uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
+ CI.BaseOff = BaseOff * CI.EltSize;
+ CI.Offset = EltOffset0 - BaseOff;
+ Paired.Offset = EltOffset1 - BaseOff;
}
return true;
}
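To make the base-offset selection concrete, a hedged worked example of the
8-bit branch above, reusing the mostAligned() sketch from earlier (the values
are chosen purely for illustration):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    // Element offsets 300 and 400: their difference (100) fits in 8 bits, so
    // any BaseOff in [Max - 255, Min] = [145, 300] makes both encodable.
    uint32_t EltOffset0 = 300, EltOffset1 = 400;
    uint32_t Min = std::min(EltOffset0, EltOffset1);
    uint32_t Max = std::max(EltOffset0, EltOffset1);
    uint32_t BaseOff = mostAligned(Max - 0xff, Min); // 256, the most aligned
    assert(BaseOff == 256);
    assert(EltOffset0 - BaseOff == 44 && EltOffset1 - BaseOff == 144);
  }

Picking the most aligned BaseOff rather than simply the minimum offset raises
the chance that several load/store pairs in a block can share one base
register, which is the stated motivation in the comments above.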
@@ -841,6 +868,26 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
+const TargetRegisterClass *
+SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
+ return TRI->getRegClassForReg(*MRI, Dst->getReg());
+ }
+ if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
+ return TRI->getRegClassForReg(*MRI, Src->getReg());
+ }
+ return nullptr;
+}
+
/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
CombineInfo &CI, CombineInfo &Paired,
@@ -873,6 +920,9 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
+ const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
+ bool IsAGPR = TRI->hasAGPRs(DataRC);
+
MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -941,6 +991,17 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
if (&*MBBI == &*Paired.I) {
+ if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
+ return false;
+ // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+ // operands. However we are reporting that ds_write2 shall have
+ // only VGPR data so that machine copy propagation does not
+ // create an illegal instruction with VGPR and AGPR sources.
+ // Consequently, if we create such an instruction the verifier
+ // will complain.
+ if (IsAGPR && CI.InstClass == DS_WRITE)
+ return false;
+
// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1014,8 +1075,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
const MCInstrDesc &Read2Desc = TII->get(Opc);
- const TargetRegisterClass *SuperRC =
- (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
Register DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
@@ -1229,8 +1289,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.DLC) // dlc
+ .addImm(CI.CPol) // cpol
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1289,10 +1348,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1356,10 +1413,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(MergedOffset) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1436,10 +1491,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
.addImm(JoinedFormat) // format
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1536,18 +1589,12 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
case 16:
return &AMDGPU::SGPR_512RegClass;
}
- } else {
- switch (CI.Width + Paired.Width) {
- default:
- return nullptr;
- case 2:
- return &AMDGPU::VReg_64RegClass;
- case 3:
- return &AMDGPU::VReg_96RegClass;
- case 4:
- return &AMDGPU::VReg_128RegClass;
- }
}
+
+ unsigned BitWidth = 32 * (CI.Width + Paired.Width);
+ return TRI->hasAGPRs(getDataRegClass(*CI.I))
+ ? TRI->getAGPRClassForBitWidth(BitWidth)
+ : TRI->getVGPRClassForBitWidth(BitWidth);
}
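Since Width counts 32-bit dwords, the merged register class now follows
directly from the combined bit width plus whether the original data lived in
AGPRs. A hypothetical mapping helper, for illustration only (the class names
are real AMDGPU register classes; the helper itself is not LLVM API):

  // Sketch: which class a merged pair of widths should land in.
  const char *mergedClassName(unsigned Width0, unsigned Width1, bool IsAGPR) {
    switch (32 * (Width0 + Width1)) {
    case 64:  return IsAGPR ? "AReg_64"  : "VReg_64";  // e.g. two B32 DS ops
    case 96:  return IsAGPR ? "AReg_96"  : "VReg_96";
    case 128: return IsAGPR ? "AReg_128" : "VReg_128"; // e.g. two B64 DS ops
    default:  return "other width";
    }
  }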
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
@@ -1596,10 +1643,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
.addImm(std::min(CI.Offset, Paired.Offset)) // offset
- .addImm(CI.GLC) // glc
- .addImm(CI.SLC) // slc
+ .addImm(CI.CPol) // cpol
.addImm(0) // tfe
- .addImm(CI.DLC) // dlc
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
@@ -1671,7 +1716,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
- Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
MachineInstr *FullBase =
BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)