Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp')
 -rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 171
 1 file changed, 115 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e16bead81b65..b7d0f0580cda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -46,8 +46,7 @@ static cl::opt<bool> AllowRiskySelect(
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
const AMDGPUTargetMachine &TM)
- : InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
STI(STI),
EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
@@ -1103,7 +1102,18 @@ bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
Register SrcReg = I.getOperand(2).getReg();
unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
+ if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) {
+ MachineInstr *ICmp =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
+
+ if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
+ *TRI.getBoolRC(), *MRI))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
int Opcode = getV_CMPOpcode(Pred, Size);
if (Opcode == -1)
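
[The guard added in this hunk bails out of V_CMP selection when the predicate
immediate of llvm.amdgcn.icmp is not an integer predicate, defining the result
via IMPLICIT_DEF instead of failing to select. A minimal standalone sketch of
the range check, assuming an enum that mirrors CmpInst::Predicate's layout,
where the FCMP_* values sit below the ICMP_* block:

    #include <cassert>

    // Hypothetical stand-in for CmpInst::Predicate; only the layout matters.
    enum Predicate : unsigned {
      FCMP_FALSE = 0,  // first floating-point predicate
      FCMP_TRUE = 15,  // last floating-point predicate
      ICMP_EQ = 32,    // first integer predicate
      ICMP_SLE = 41    // last integer predicate
    };

    // Mirrors ICmpInst::isIntPredicate: true only inside the ICMP_* range.
    static bool isIntPredicate(unsigned P) {
      return P >= ICMP_EQ && P <= ICMP_SLE;
    }

    int main() {
      assert(isIntPredicate(32));  // ICMP_EQ selects to a V_CMP opcode
      assert(!isIntPredicate(0));  // FCMP_FALSE now becomes IMPLICIT_DEF
    }
]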
@@ -1234,7 +1244,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
// Get the return address reg and mark it as an implicit live-in
Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
- AMDGPU::SReg_64RegClass);
+ AMDGPU::SReg_64RegClass, DL);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
.addReg(LiveIn);
I.eraseFromParent();
@@ -1494,9 +1504,9 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
if (TexFailCtrl)
IsTexFail = true;
- TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TFE = (TexFailCtrl & 0x1) ? true : false;
TexFailCtrl &= ~(uint64_t)0x1;
- LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ LWE = (TexFailCtrl & 0x2) ? true : false;
TexFailCtrl &= ~(uint64_t)0x2;
return TexFailCtrl == 0;
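
[For reference, the decode above can be modeled standalone: bit 0 selects TFE,
bit 1 selects LWE, and any leftover bit makes the control word invalid. A
sketch under those assumptions, outside the selector:

    #include <cassert>
    #include <cstdint>

    static bool parseTexFailModel(uint64_t Ctrl, bool &TFE, bool &LWE) {
      TFE = Ctrl & 0x1; // bit 0: texture-fail enable
      LWE = Ctrl & 0x2; // bit 1: LOD-warning enable
      return (Ctrl & ~uint64_t(0x3)) == 0; // reject unknown bits
    }

    int main() {
      bool TFE = false, LWE = false;
      assert(parseTexFailModel(0x3, TFE, LWE) && TFE && LWE);
      assert(!parseTexFailModel(0x4, TFE, LWE)); // unknown bit => invalid
    }
]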
@@ -1511,10 +1521,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
- const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
- AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
@@ -1523,7 +1529,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
Register VDataIn, VDataOut;
LLT VDataTy;
int NumVDataDwords = -1;
- bool IsD16 = false;
+ bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
bool Unorm;
if (!BaseOpcode->Sampler)
@@ -1572,16 +1579,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
- // One memoperand is mandatory, except for getresinfo.
- // FIXME: Check this in verifier.
- if (!MI.memoperands_empty()) {
- const MachineMemOperand *MMO = *MI.memoperands_begin();
-
- // Infer d16 from the memory size, as the register type will be mangled by
- // unpacked subtargets, or by TFE.
- IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
- }
-
if (BaseOpcode->Store) {
VDataIn = MI.getOperand(1).getReg();
VDataTy = MRI->getType(VDataIn);
@@ -1596,26 +1593,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
}
- // Optimize _L to _LZ when _L is zero
- if (LZMappingInfo) {
- // The legalizer replaced the register with an immediate 0 if we need to
- // change the opcode.
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (MIPMappingInfo) {
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- }
- }
-
// Set G16 opcode
if (IsG16 && !IsA16) {
const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
@@ -2562,6 +2539,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskReg = I.getOperand(2).getReg();
LLT Ty = MRI->getType(DstReg);
LLT MaskTy = MRI->getType(MaskReg);
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -2570,6 +2549,24 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
if (DstRB != SrcRB) // Should only happen for hand written MIR.
return false;
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+
+ const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
+ const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
+
+ if (!IsVGPR && Ty.getSizeInBits() == 64 &&
+ !CanCopyLow32 && !CanCopyHi32) {
+ auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(SrcReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
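
[The hoisted CanCopyLow32/CanCopyHi32 computation now also gates a new fast
path: for a 64-bit SALU ptrmask where neither half of the mask is known
all-ones, a single S_AND_B64 is cheaper than decomposing into two 32-bit ANDs.
The half-copy test, modeled with plain integers instead of APInt (a sketch,
not the selector code):

    #include <cassert>
    #include <cstdint>

    // A half whose known-one bits cover all 32 positions needs only a COPY.
    static bool canCopyLow32(uint64_t KnownOnes) {
      return (KnownOnes & 0xffffffffull) == 0xffffffffull;
    }
    static bool canCopyHi32(uint64_t KnownOnes) {
      return (KnownOnes >> 32) == 0xffffffffull;
    }

    int main() {
      assert(canCopyLow32(0x00000000ffffffffull)); // low half: plain COPY
      // Neither half all-ones: the new path emits one S_AND_B64 instead.
      assert(!canCopyLow32(0x0000ffff0000ffffull));
      assert(!canCopyHi32(0x0000ffff0000ffffull));
    }
]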
@@ -2586,8 +2583,6 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
!RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
- MachineBasicBlock *BB = I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
if (Ty.getSizeInBits() == 32) {
assert(MaskTy.getSizeInBits() == 32 &&
"ptrmask should have been narrowed during legalize");
@@ -2610,13 +2605,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskedLo, MaskedHi;
- // Try to avoid emitting a bit operation when we only need to touch half of
- // the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
-
- const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
- const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
- if ((MaskOnes & MaskLo32) == MaskLo32) {
+ if (CanCopyLow32) {
// If all the bits in the low half are 1, we only need a copy for it.
MaskedLo = LoReg;
} else {
@@ -2631,7 +2620,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
.addReg(MaskLo);
}
- if ((MaskOnes & MaskHi32) == MaskHi32) {
+ if (CanCopyHi32) {
// If all the bits in the high half are 1, we only need a copy for it.
MaskedHi = HiReg;
} else {
@@ -3123,6 +3112,33 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
return true;
}
+bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(Subtarget->getWavefrontSizeLog2())
+ .addReg(SrcReg);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcReg)
+ .addImm(Subtarget->getWavefrontSizeLog2());
+ }
+
+ const TargetRegisterClass &RC =
+ IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
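
[The new selectWaveAddress above lowers G_AMDGPU_WAVE_ADDRESS to a right shift
by log2 of the wavefront size: V_LSHRREV_B32_e64 on the VALU path (note the
reversed operand order, shift amount first) and S_LSHR_B32 on the SALU path.
The arithmetic it implements, as a standalone sketch with plain integers
standing in for the registers:

    #include <cassert>
    #include <cstdint>

    static uint32_t waveAddress(uint32_t SrcReg, unsigned WaveSizeLog2) {
      return SrcReg >> WaveSizeLog2; // S_LSHR_B32 / V_LSHRREV_B32_e64
    }

    int main() {
      // A wave64 target (log2 = 6): byte offset 0x1000 -> wave address 0x40.
      assert(waveAddress(0x1000, 6) == 0x40);
    }
]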
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -3236,7 +3252,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectG_SHUFFLE_VECTOR(I);
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
assert(Intr && "not an image intrinsic with image pseudo");
@@ -3252,6 +3270,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_SI_CALL:
I.setDesc(TII.get(AMDGPU::SI_CALL));
return true;
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
+ return selectWaveAddress(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3896,20 +3916,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
}
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+ return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+ ? Def->getOperand(1).getReg()
+ : Register();
+}
+
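
[The getWaveAddress helper above feeds the selectMUBUFScratchOffset change
below: a scratch access whose base is a G_AMDGPU_WAVE_ADDRESS, optionally with
a legal constant G_PTR_ADD on top, can place the wave base directly in soffset
and the constant in the immediate offset field. A standalone model of that
fold, with simplified stand-ins for the selector's structures and 4095 assumed
as the legal MUBUF immediate limit:

    #include <cassert>
    #include <cstdint>
    #include <optional>
    #include <utility>

    struct Addr {
      bool IsWaveAddress; // base defined by G_AMDGPU_WAVE_ADDRESS?
      uint32_t WaveBase;  // the wave-level SGPR base
      int64_t Imm;        // constant added on top, if any
    };

    static std::optional<std::pair<uint32_t, int64_t>>
    foldScratchOffset(const Addr &A, int64_t MaxImm) {
      if (!A.IsWaveAddress || A.Imm < 0 || A.Imm > MaxImm)
        return std::nullopt;
      return std::make_pair(A.WaveBase, A.Imm); // (soffset reg, imm offset)
    }

    int main() {
      auto F = foldScratchOffset({true, /*WaveBase=*/32, /*Imm=*/16}, 4095);
      assert(F && F->first == 32 && F->second == 16);
      assert(!foldScratchOffset({true, 32, 1 << 20}, 4095)); // offset too big
    }
]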
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
+ Register Reg = Root.getReg();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ const MachineInstr *Def = MRI->getVRegDef(Reg);
+ if (Register WaveBase = getWaveAddress(Def)) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+ }};
+ }
int64_t Offset = 0;
+
+ // FIXME: Copy check is a hack
+ Register BasePtr;
+ if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+ const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+ Register WaveBase = getWaveAddress(BasePtrDef);
+ if (!WaveBase)
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ }};
+ }
+
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
- const MachineFunction *MF = MBB->getParent();
- const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());