Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 425
1 file changed, 382 insertions, 43 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 5aba35a19ced..3f99d5cfb7f9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -244,7 +244,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, S32, S64, S16, V2S16 }; - setAction({G_BRCOND, S1}, Legal); + setAction({G_BRCOND, S1}, Legal); // VCC branches + setAction({G_BRCOND, S32}, Legal); // SCC branches // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more // elements for v3s16 @@ -272,6 +273,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } + // FIXME: Not really legal. Placeholder for custom lowering. + getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) + .legalFor({S32, S64}) + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(0, 32) + .scalarize(0); + getActionDefinitionsBuilder({G_UMULH, G_SMULH}) .legalFor({S32}) .clampScalar(0, S32, S32) @@ -289,7 +297,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) - .legalFor({{S32, S1}}) + .legalFor({{S32, S1}, {S32, S32}}) .clampScalar(0, S32, S32) .scalarize(0); // TODO: Implement. @@ -354,6 +362,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.hasVOP3PInsts()) { MinNumMaxNum.customFor(FPTypesPK16) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .clampMaxNumElements(0, S16, 2) .clampScalar(0, S16, S64) .scalarize(0); @@ -438,13 +447,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {S96, S32}, // FIXME: Hack {S64, LLT::scalar(33)}, - {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) - .scalarize(0); + {S32, S8}, {S32, LLT::scalar(24)}}) + .scalarize(0) + .clampScalar(0, S32, S64); // TODO: Split s1->s64 during regbankselect for VALU. auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) + .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) .lowerFor({{S32, S64}}) + .lowerIf(typeIs(1, S1)) .customFor({{S64, S64}}); if (ST.has16BitInsts()) IToFP.legalFor({{S16, S16}}); @@ -462,10 +473,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) - .legalFor({S32, S64}) - .scalarize(0); + .scalarize(0) + .lower(); - if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + if (ST.has16BitInsts()) { + getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) + .legalFor({S16, S32, S64}) + .clampScalar(0, S16, S64) + .scalarize(0); + } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) .legalFor({S32, S64}) .clampScalar(0, S32, S64) @@ -478,7 +494,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - getActionDefinitionsBuilder(G_GEP) + getActionDefinitionsBuilder(G_PTR_ADD) .legalForCartesianProduct(AddrSpaces64, {S64}) .legalForCartesianProduct(AddrSpaces32, {S32}) .scalarize(0); @@ -491,9 +507,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, auto &CmpBuilder = getActionDefinitionsBuilder(G_ICMP) + // The compare output type differs based on the register bank of the output, + // so make both s1 and s32 legal. 
+ // + // Scalar compares producing output in scc will be promoted to s32, as that + // is the allocatable register type that will be needed for the copy from + // scc. This will be promoted during RegBankSelect, and we assume something + // before that won't try to use s32 result types. + // + // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg + // bank. .legalForCartesianProduct( {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) - .legalFor({{S1, S32}, {S1, S64}}); + .legalForCartesianProduct( + {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); if (ST.has16BitInsts()) { CmpBuilder.legalFor({{S1, S16}}); } @@ -502,7 +529,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(1) .clampScalar(1, S32, S64) .scalarize(0) - .legalIf(all(typeIs(0, S1), isPointer(1))); + .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); getActionDefinitionsBuilder(G_FCMP) .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) @@ -639,6 +666,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + + if (MemSize < DstTy.getSizeInBits()) + MemSize = std::max(MemSize, Align); + if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -653,7 +685,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) return true; - unsigned Align = Query.MMODescrs[0].AlignInBits; if (Align < MemSize) { const SITargetLowering *TLI = ST.getTargetLowering(); return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); @@ -795,14 +826,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, unsigned MemSize = Query.MMODescrs[0].SizeInBits; unsigned Align = Query.MMODescrs[0].AlignInBits; - // No extending vector loads. - if (Size > MemSize && Ty0.isVector()) - return false; - // FIXME: Widening store from alignment not valid. if (MemSize < Size) MemSize = std::max(MemSize, Align); + // No extending vector loads. + if (Size > MemSize && Ty0.isVector()) + return false; + switch (MemSize) { case 8: case 16: @@ -848,7 +879,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, - G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG}) + G_ATOMICRMW_UMIN}) .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S64, GlobalPtr}, {S64, LocalPtr}}); if (ST.hasFlatAddressSpace()) { @@ -858,14 +889,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_ATOMICRMW_FADD) .legalFor({{S32, LocalPtr}}); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output + // demarshalling + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) + .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, + {S32, FlatPtr}, {S64, FlatPtr}}) + .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, + {S32, RegionPtr}, {S64, RegionPtr}}); + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) .lower(); // TODO: Pointer types, any 32-bit or 64-bit vector + + // Condition should be s32 for scalar, s1 for vector. 
getActionDefinitionsBuilder(G_SELECT) .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, - LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) + LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) .clampScalar(0, S16, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .fewerElementsIf(numElementsNotEven(0), scalarize(0)) @@ -875,7 +916,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampMaxNumElements(0, PrivatePtr, 2) .scalarize(0) .widenScalarToNextPow2(0) - .legalIf(all(isPointer(0), typeIs(1, S1))); + .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can // be more flexible with the shift amount type. @@ -888,7 +929,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } else Shifts.legalFor({{S16, S32}, {S16, S16}}); - Shifts.clampScalar(1, S16, S32); + // TODO: Support 16-bit shift amounts + Shifts.clampScalar(1, S32, S32); Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); } else { @@ -1075,6 +1117,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); + + getActionDefinitionsBuilder(G_READCYCLECOUNTER) + .legalFor({S64}); + + getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, + G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, + G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) + .unsupported(); + computeTables(); verify(*ST.getInstrInfo()); } @@ -1116,6 +1168,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: return legalizeFDIV(MI, MRI, B); + case TargetOpcode::G_ATOMIC_CMPXCHG: + return legalizeAtomicCmpXChg(MI, MRI, B); default: return false; } @@ -1175,12 +1229,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - // FIXME: Don't use undef - Value *V = UndefValue::get(PointerType::get( - Type::getInt8Ty(MF.getFunction().getContext()), - AMDGPUAS::CONSTANT_ADDRESS)); - - MachinePointerInfo PtrInfo(V, StructOffset); + // TODO: can we be smarter about machine pointer info? + MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad | @@ -1192,7 +1242,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register LoadResult = MRI.createGenericVirtualRegister(S32); Register LoadAddr; - B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); + B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); B.buildLoad(LoadResult, LoadAddr, *MMO); return LoadResult; } @@ -1709,13 +1759,15 @@ bool AMDGPULegalizerInfo::legalizeFMad( LLT Ty = MRI.getType(MI.getOperand(0).getReg()); assert(Ty.isScalar()); + MachineFunction &MF = B.getMF(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + // TODO: Always legal with future ftz flag. 
- if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) + if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) return true; - if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) + if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) return true; - MachineFunction &MF = B.getMF(); MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; @@ -1724,16 +1776,55 @@ bool AMDGPULegalizerInfo::legalizeFMad( return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; } +bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + Register DstReg = MI.getOperand(0).getReg(); + Register PtrReg = MI.getOperand(1).getReg(); + Register CmpVal = MI.getOperand(2).getReg(); + Register NewVal = MI.getOperand(3).getReg(); + + assert(SITargetLowering::isFlatGlobalAddrSpace( + MRI.getType(PtrReg).getAddressSpace()) && + "this should not have been custom lowered"); + + LLT ValTy = MRI.getType(CmpVal); + LLT VecTy = LLT::vector(2, ValTy); + + B.setInstr(MI); + Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); + + B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) + .addDef(DstReg) + .addUse(PtrReg) + .addUse(PackedVal) + .setMemRefs(MI.memoperands()); + + MI.eraseFromParent(); + return true; +} + // Return the use branch instruction, otherwise null if the usage is invalid. static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI) { + MachineRegisterInfo &MRI, + MachineInstr *&Br) { Register CondDef = MI.getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDef)) return nullptr; MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); - return UseMI.getParent() == MI.getParent() && - UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr; + if (UseMI.getParent() != MI.getParent() || + UseMI.getOpcode() != AMDGPU::G_BRCOND) + return nullptr; + + // Make sure the cond br is followed by a G_BR + MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); + if (Next != MI.getParent()->end()) { + if (Next->getOpcode() != AMDGPU::G_BR) + return nullptr; + Br = &*Next; + } + + return &UseMI; } Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, @@ -1823,10 +1914,22 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { B.setInstr(MI); + Register Dst = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); if (legalizeFastUnsafeFDIV(MI, MRI, B)) return true; + if (DstTy == S16) + return legalizeFDIV16(MI, MRI, B); + if (DstTy == S32) + return legalizeFDIV32(MI, MRI, B); + if (DstTy == S64) + return legalizeFDIV64(MI, MRI, B); + return false; } @@ -1850,7 +1953,8 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) return false; - if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals()) + if (!Unsafe && ResTy == S32 && + MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -1890,6 +1994,219 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, return false; } +bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + uint16_t 
Flags = MI.getFlags(); + + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + + auto LHSExt = B.buildFPExt(S32, LHS, Flags); + auto RHSExt = B.buildFPExt(S32, RHS, Flags); + + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) + .addUse(RHSExt.getReg(0)) + .setMIFlags(Flags); + + auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); + auto RDst = B.buildFPTrunc(S16, QUOT, Flags); + + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) + .addUse(RDst.getReg(0)) + .addUse(RHS) + .addUse(LHS) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; +} + +// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions +// to enable denorm mode. When 'Enable' is false, disable denorm mode. +static void toggleSPDenormMode(bool Enable, + MachineIRBuilder &B, + const GCNSubtarget &ST, + AMDGPU::SIModeRegisterDefaults Mode) { + // Set SP denorm mode to this value. + unsigned SPDenormMode = + Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + if (ST.hasDenormModeInst()) { + // Preserve default FP64FP16 denorm mode while updating FP32 mode. + unsigned DPDenormModeDefault = Mode.FP64FP16Denormals + ? FP_DENORM_FLUSH_NONE + : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); + B.buildInstr(AMDGPU::S_DENORM_MODE) + .addImm(NewDenormModeValue); + + } else { + // Select FP32 bit field in mode register. + unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | + (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | + (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); + + B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) + .addImm(SPDenormMode) + .addImm(SPDenormModeBitField); + } +} + +bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); + + uint16_t Flags = MI.getFlags(); + + LLT S32 = LLT::scalar(32); + LLT S1 = LLT::scalar(1); + + auto One = B.buildFConstant(S32, 1.0f); + + auto DenominatorScaled = + B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) + .addUse(RHS) + .addUse(LHS) + .addImm(1) + .setMIFlags(Flags); + auto NumeratorScaled = + B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) + .addUse(LHS) + .addUse(RHS) + .addImm(0) + .setMIFlags(Flags); + + auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) + .addUse(DenominatorScaled.getReg(0)) + .setMIFlags(Flags); + auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); + + // FIXME: Doesn't correctly model the FP mode switch, and the FP operations + // aren't modeled as reading it. 
+ if (!Mode.FP32Denormals) + toggleSPDenormMode(true, B, ST, Mode); + + auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); + auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); + auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); + auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); + auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); + auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); + + if (!Mode.FP32Denormals) + toggleSPDenormMode(false, B, ST, Mode); + + auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) + .addUse(Fma4.getReg(0)) + .addUse(Fma1.getReg(0)) + .addUse(Fma3.getReg(0)) + .addUse(NumeratorScaled.getReg(1)) + .setMIFlags(Flags); + + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) + .addUse(Fmas.getReg(0)) + .addUse(RHS) + .addUse(LHS) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + B.setInstr(MI); + Register Res = MI.getOperand(0).getReg(); + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + uint16_t Flags = MI.getFlags(); + + LLT S64 = LLT::scalar(64); + LLT S1 = LLT::scalar(1); + + auto One = B.buildFConstant(S64, 1.0); + + auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) + .addUse(LHS) + .addUse(RHS) + .addImm(1) + .setMIFlags(Flags); + + auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); + + auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) + .addUse(DivScale0.getReg(0)) + .setMIFlags(Flags); + + auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); + auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); + auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); + + auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) + .addUse(LHS) + .addUse(RHS) + .addImm(0) + .setMIFlags(Flags); + + auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); + auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); + auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); + + Register Scale; + if (!ST.hasUsableDivScaleConditionOutput()) { + // Workaround a hardware bug on SI where the condition output from div_scale + // is not usable. 
+ + Scale = MRI.createGenericVirtualRegister(S1); + + LLT S32 = LLT::scalar(32); + + auto NumUnmerge = B.buildUnmerge(S32, LHS); + auto DenUnmerge = B.buildUnmerge(S32, RHS); + auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); + auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); + + auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), + Scale1Unmerge.getReg(1)); + auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), + Scale0Unmerge.getReg(1)); + B.buildXor(Scale, CmpNum, CmpDen); + } else { + Scale = DivScale1.getReg(1); + } + + auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) + .addUse(Fma4.getReg(0)) + .addUse(Fma3.getReg(0)) + .addUse(Mul.getReg(0)) + .addUse(Scale) + .setMIFlags(Flags); + + B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) + .addUse(Fmas.getReg(0)) + .addUse(RHS) + .addUse(LHS) + .setMIFlags(Flags); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1955,7 +2272,7 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, if (!loadInputValue(KernargPtrReg, B, Arg)) return false; - B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); MI.eraseFromParent(); return true; } @@ -2032,19 +2349,38 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { // Replace the use G_BRCOND with the exec manipulate and branch pseudos. - switch (MI.getIntrinsicID()) { - case Intrinsic::amdgcn_if: { - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { + auto IntrID = MI.getIntrinsicID(); + switch (IntrID) { + case Intrinsic::amdgcn_if: + case Intrinsic::amdgcn_else: { + MachineInstr *Br = nullptr; + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); B.setInstr(*BrCond); Register Def = MI.getOperand(1).getReg(); Register Use = MI.getOperand(3).getReg(); - B.buildInstr(AMDGPU::SI_IF) - .addDef(Def) - .addUse(Use) - .addMBB(BrCond->getOperand(1).getMBB()); + + MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); + if (Br) + BrTarget = Br->getOperand(0).getMBB(); + + if (IntrID == Intrinsic::amdgcn_if) { + B.buildInstr(AMDGPU::SI_IF) + .addDef(Def) + .addUse(Use) + .addMBB(BrTarget); + } else { + B.buildInstr(AMDGPU::SI_ELSE) + .addDef(Def) + .addUse(Use) + .addMBB(BrTarget) + .addImm(0); + } + + if (Br) + Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); @@ -2056,11 +2392,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return false; } case Intrinsic::amdgcn_loop: { - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { + MachineInstr *Br = nullptr; + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); B.setInstr(*BrCond); + + // FIXME: Need to adjust branch targets based on unconditional branch. Register Reg = MI.getOperand(2).getReg(); B.buildInstr(AMDGPU::SI_LOOP) .addUse(Reg) |
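
A note on the G_ATOMIC_CMPXCHG hunks above: for the flat/global address spaces the generic operation is custom-lowered to G_AMDGPU_ATOMIC_CMPXCHG, whose data operand is the two-element vector built as { NewVal, CmpVal }, and which returns only the original memory value. The scalar model below is a sketch of the assumed packed compare-swap semantics (the names cmpSwapModel and Packed are illustrative, not part of the patch); it is meant to show why the success-flag form, G_ATOMIC_CMPXCHG_WITH_SUCCESS, has to be lowered separately.

#include <cstdint>

// Illustrative scalar model (not the patch's code) of the packed
// compare-swap that G_AMDGPU_ATOMIC_CMPXCHG is assumed to map to.
// The hardware instruction consumes the data pair and returns only the
// pre-existing memory value; there is no separate success bit.
static uint32_t cmpSwapModel(uint32_t *Addr, const uint32_t Packed[2]) {
  const uint32_t NewVal = Packed[0]; // element 0 of the G_BUILD_VECTOR
  const uint32_t CmpVal = Packed[1]; // element 1 of the G_BUILD_VECTOR
  uint32_t Old = *Addr;              // performed atomically by the hardware
  if (Old == CmpVal)
    *Addr = NewVal;
  return Old;
}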
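The magic constants in toggleSPDenormMode come from the MODE register layout: the denormal controls occupy MODE[7:4], with the single-precision field in the low two bits and the double/half field in the high two bits, which is why the S_DENORM_MODE path re-packs the FP64/FP16 default with "<< 2" while the S_SETREG path writes only a 2-bit field at offset 4. The snippet below sketches that simm16 encoding under the assumption that the hwreg operand packs {id, offset, width-1} with the offset field at bit 6 and the width-1 field at bit 11; the concrete shift values are assumptions made for illustration, not taken from this patch.

// Sketch of the s_setreg operand built in toggleSPDenormMode, with the
// field positions written out as assumed constants rather than the
// AMDGPU::Hwreg::* definitions used by the patch.
constexpr unsigned HwRegIdMode  = 1;  // assumed id of the MODE register
constexpr unsigned OffsetShift  = 6;  // assumed position of the offset field
constexpr unsigned WidthM1Shift = 11; // assumed position of the width-1 field

// "Write a 2-bit field starting at bit 4 of MODE", i.e. the FP32 denorm
// control in MODE[5:4]; MODE[7:6] (FP64/FP16 denorms) is left untouched.
constexpr unsigned SPDenormModeBitField =
    HwRegIdMode | (4u << OffsetShift) | (1u << WidthM1Shift);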
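The FP32 path in legalizeFDIV32 is a div_scale / reciprocal / Newton-Raphson sequence. Stripping out the div_scale pre-scaling, the denorm-mode toggling, and the div_fmas/div_fixup special-case handling, the refinement itself looks like the scalar model below; this is a sketch only, and fdiv32Model's use of 1.0f/D in place of the approximate v_rcp_f32 is an assumption made for illustration.

#include <cmath>

// Scalar sketch of the refinement steps emitted by legalizeFDIV32,
// ignoring operand scaling and special-case fixup.
static float fdiv32Model(float N, float D) {
  float R0 = 1.0f / D;               // stands in for the approximate v_rcp_f32
  float E0 = std::fma(-D, R0, 1.0f); // Fma0: reciprocal error
  float R1 = std::fma(E0, R0, R0);   // Fma1: refined reciprocal
  float Q0 = N * R1;                 // Mul:  initial quotient
  float E1 = std::fma(-D, Q0, N);    // Fma2: quotient residual
  float Q1 = std::fma(E1, R1, Q0);   // Fma3: refined quotient
  float E2 = std::fma(-D, Q1, N);    // Fma4: final residual
  return std::fma(E2, R1, Q1);       // the div_fmas step, minus its VCC-driven scaling
}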
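For the f64 path, the hasUsableDivScaleConditionOutput() workaround rebuilds the condition that the broken SI hardware fails to produce: it unmerges the 64-bit operands and div_scale results into 32-bit halves, compares the high words for equality, and XORs the two comparisons. Below is a scalar sketch of that bit-level check; the function name and the uint64_t packing of the values are illustrative assumptions.

#include <cstdint>

// Sketch of the condition reconstructed in legalizeFDIV64 on SI, where the
// VCC output of v_div_scale_f64 is not usable. Each argument is the raw
// 64-bit pattern of the corresponding value in the patch.
static bool divScaleCondModel(uint64_t LHS, uint64_t RHS,
                              uint64_t DivScale0, uint64_t DivScale1) {
  bool CmpNum = uint32_t(LHS >> 32) == uint32_t(DivScale1 >> 32); // high words
  bool CmpDen = uint32_t(RHS >> 32) == uint32_t(DivScale0 >> 32);
  return CmpNum != CmpDen; // the G_XOR that feeds amdgcn_div_fmas
}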