author     Dimitry Andric <dim@FreeBSD.org>  2021-07-29 20:15:26 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2021-07-29 20:15:26 +0000
commit     344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree       f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent     b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
Vendor import of llvm-project main 88e66fa60ae5, the last commit before
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  1218
1 file changed, 801 insertions, 417 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 839437b5e3f8..d98acfc6c532 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -19,11 +19,13 @@ #include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" @@ -80,36 +82,49 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + + addRegisterClass(MVT::f64, V64RegClass); + addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + + addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); @@ -123,7 +138,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - 
addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -139,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v3i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v5i32, Custom); + setOperationAction(ISD::LOAD, MVT::v6i32, Custom); + setOperationAction(ISD::LOAD, MVT::v7i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); setOperationAction(ISD::LOAD, MVT::i1, Custom); @@ -148,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v3i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v5i32, Custom); + setOperationAction(ISD::STORE, MVT::v6i32, Custom); + setOperationAction(ISD::STORE, MVT::v7i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -170,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); @@ -197,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); @@ -239,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // with > 4 elements. 
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, + MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { @@ -249,10 +279,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::BITCAST: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: case ISD::SCALAR_TO_VECTOR: break; + case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); break; @@ -284,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); + } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); @@ -336,17 +380,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Avoid stack access for these. // TODO: Generalize to more vector types. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); @@ -362,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); - // Deal with vec5 vector operations when widened to vec8. + // Deal with vec5/6/7 vector operations when widened to vec8. 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); @@ -384,6 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. @@ -525,8 +571,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); // F16 - Constant Actions. setOperationAction(ISD::ConstantFP, MVT::f16, Legal); @@ -718,6 +764,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + + if (Subtarget->hasPackedFP32Ops()) { + setOperationAction(ISD::FADD, MVT::v2f32, Legal); + setOperationAction(ISD::FMUL, MVT::v2f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f32, Legal); + setOperationAction(ISD::FNEG, MVT::v2f32, Legal); + + for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } } setOperationAction(ISD::FNEG, MVT::v4f16, Custom); @@ -1128,17 +1187,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1150,6 +1198,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1191,6 +1255,9 @@ bool 
SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1210,9 +1277,9 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { } return AM.Scale == 0 && - (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, - /*Signed=*/false)); + (AM.BaseOffs == 0 || + Subtarget->getInstrInfo()->isLegalFLATOffset( + AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); } bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { @@ -1220,7 +1287,7 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { return AM.Scale == 0 && (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, - /*Signed=*/true)); + SIInstrFlags::FlatGlobal)); if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { // Assume the we will use FLAT for all global memory accesses @@ -1385,10 +1452,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return true; } + // Either, the alignment requirements are "enabled", or there is an + // unaligned LDS access related hardware bug though alignment requirements + // are "disabled". In either case, we need to check for proper alignment + // requirements. + // if (Size == 64) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. + // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we + // can do a 4 byte aligned, 8 byte access in a single operation using + // ds_read2/write2_b32 with adjacent offsets. bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; @@ -1396,22 +1468,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } if (Size == 96) { - // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = Alignment >= Align(16); + // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on + // gfx8 and older. + bool AlignedBy16 = Alignment >= Align(16); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy16; - return Aligned; + return AlignedBy16; } if (Size == 128) { - // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we - // can do a 8 byte aligned, 16 byte access in a single operation using - // ds_read2/write2_b64. - bool Aligned = Alignment >= Align(8); + // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on + // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a + // single operation using ds_read2/write2_b64. 
+ bool AlignedBy8 = Alignment >= Align(8); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy8; - return Aligned; + return AlignedBy8; } } @@ -1467,8 +1540,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( } bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Alignment, - MachineMemOperand::Flags Flags, bool *IsFast) const { + EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, + bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1482,7 +1555,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Align(Alignment), Flags, IsFast); + Alignment, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1535,8 +1608,8 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - int NumElts = VT.getVectorNumElements(); - if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + VT.getScalarType().bitsLE(MVT::i16)) return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1799,23 +1872,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + unsigned Mask = (Subtarget->hasPackedTID() && + Info.hasWorkItemIDY()) ? 0x3ff : ~0u; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); } if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 10)); + } else { + unsigned Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } } if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 20)); + } else { + unsigned Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } } } @@ -1865,12 +1952,32 @@ static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, return ArgDescriptor::createRegister(Reg); } -static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +// If this has a fixed position, we still should allocate the register in the +// CCInfo state. Technically we could get away with this for values passed +// outside of the normal argument range. 
+static void allocateFixedSGPRInputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + MCRegister Reg) { + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); +} + +static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); } -static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } /// Allocate implicit function VGPR arguments at the end of allocated user @@ -1919,29 +2026,29 @@ void SITargetLowering::allocateSpecialInputSGPRs( // TODO: Unify handling with private memory pointers. if (Info.hasDispatchPtr()) - ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); if (Info.hasQueuePtr()) - ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); if (Info.hasDispatchID()) - ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. if (Info.hasWorkGroupIDX()) - ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); if (Info.hasWorkGroupIDY()) - ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); if (Info.hasWorkGroupIDZ()) - ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } // Allocate special inputs passed in user SGPRs. @@ -2203,6 +2310,8 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } + Info->allocateModuleLDSGlobal(Fn.getParent()); + SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; BitVector Skipped(Ins.size()); @@ -2767,6 +2876,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: + case CallingConv::AMDGPU_Gfx: return true; default: return canGuaranteeTCO(CC); @@ -2781,6 +2891,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization( if (!mayTailCallThisCC(CalleeCC)) return false; + // For a divergent call target, we need to do a waterfall loop over the + // possible callees which precludes us from using a simple jump. 
+ if (Callee->isDivergent()) + return false; + MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); @@ -2888,12 +3003,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!CLI.CB) report_fatal_error("unsupported libcall legalization"); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) { - return lowerUnhandledCall(CLI, InVals, - "unsupported indirect call to function "); - } - if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { return lowerUnhandledCall(CLI, InVals, "unsupported required tail call to function "); @@ -3054,7 +3163,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // locations, which are supposed to be immutable? Chain = addTokenForArgument(Chain, DAG, MFI, FI); } else { - DstAddr = PtrOff; + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), + MVT::i32); + DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); Alignment = commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); @@ -4150,11 +4262,35 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } case AMDGPU::DS_GWS_INIT: - case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: + if (Subtarget->needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + Register DataReg = Op->getReg(); + bool IsAGPR = TRI->isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op->getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op->setReg(NewVR); + Op->setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); + } + LLVM_FALLTHROUGH; + case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: - case AMDGPU::DS_GWS_BARRIER: // A s_waitcnt 0 is required to be the instruction immediately following. 
if (getSubtarget()->hasGWSAutoReplay()) { bundleInstWithWaitcnt(MI); @@ -4360,7 +4496,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4381,7 +4518,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4456,6 +4594,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::FMA: return splitTernaryVectorOp(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -5092,12 +5233,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) + return lowerTrapEndpgm(Op, DAG); + + if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) { + switch (*HsaAbiVer) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + return lowerTrapHsaQueuePtr(Op, DAG); + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + return Subtarget->supportsGetDoorbellID() ? 
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); + } + } + + llvm_unreachable("Unknown trap handler"); +} + +SDValue SITargetLowering::lowerTrapEndpgm( + SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +} - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +SDValue SITargetLowering::lowerTrapHsaQueuePtr( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -5108,22 +5272,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); SDValue Ops[] = { ToReg, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01, ToReg.getValue(1) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } +SDValue SITargetLowering::lowerTrapHsa( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(TrapID, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} + SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -5133,9 +5312,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { return Chain; } + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); SDValue Ops[] = { Chain, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + DAG.getTargetConstant(TrapID, SL, MVT::i16) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } @@ -5666,23 +5846,10 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef<SDValue> Elts) { assert(!Elts.empty()); MVT Type; - unsigned NumElts; - - if (Elts.size() == 1) { - Type = MVT::f32; - NumElts = 1; - } else if (Elts.size() == 2) { - Type = MVT::v2f32; - NumElts = 2; - } else if (Elts.size() == 3) { - Type = MVT::v3f32; - NumElts = 3; - } else if (Elts.size() <= 4) { - Type = MVT::v4f32; - NumElts = 4; - } else if (Elts.size() <= 8) { - Type = MVT::v8f32; - NumElts = 8; + unsigned NumElts = Elts.size(); + + if (NumElts <= 8) { + Type = MVT::getVectorVT(MVT::f32, NumElts); } else { assert(Elts.size() <= 16); Type = MVT::v16f32; @@ -5704,28 +5871,6 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, return DAG.getBuildVector(Type, DL, VecElts); } -static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC, SDValue *DLC) { - auto CachePolicyConst = 
cast<ConstantSDNode>(CachePolicy.getNode()); - - uint64_t Value = CachePolicyConst->getZExtValue(); - SDLoc DL(CachePolicy); - if (GLC) { - *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x1; - } - if (SLC) { - *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x2; - } - if (DLC) { - *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x4; - } - - return Value == 0; -} - static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts) { EVT SrcVT = Src.getValueType(); @@ -5752,7 +5897,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, ArrayRef<EVT> ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, - const SDLoc &DL, LLVMContext &Context) { + const SDLoc &DL) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; @@ -5835,11 +5980,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } -static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, - MVT PackVectorVT, - SmallVectorImpl<SDValue> &PackedAddrs, - unsigned DimIdx, unsigned EndIdx, - unsigned NumGradients) { +static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { SDLoc DL(Op); for (unsigned I = DimIdx; I < EndIdx; I++) { SDValue Addr = Op.getOperand(I); @@ -5994,56 +6139,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); MVT VAddrScalarVT = VAddrVT.getScalarType(); - MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); VAddrScalarVT = VAddrVT.getScalarType(); + MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? 
MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; - if (IsA16 || IsG16) { - if (IsA16) { - if (!ST->hasA16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit addresses\n"); - return Op; - } - if (!IsG16) { - LLVM_DEBUG( - dbgs() << "Failed to lower image intrinsic: 16 bit addresses " - "need 16 bit derivatives but got 32 bit derivatives\n"); - return Op; - } - } else if (!ST->hasG16()) { + + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { + // 16 bit gradients are supported, but are tied to the A16 control + // so both gradients and addresses must be 16 bit + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "require 16 bit args for both gradients and addresses"); + return Op; + } + + if (IsA16) { + if (!ST->hasA16()) { LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); + "support 16 bit addresses\n"); return Op; } + } - if (BaseOpcode->Gradients && !IsA16) { - if (!ST->hasG16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); - return Op; - } - // Activate g16 - const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = - AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); - IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 - } + // We've dealt with incorrect input so we know that if IsA16, IsG16 + // are set then we have to compress/pack operands (either address, + // gradient or both) + // In the case where a16 and gradients are tied (no G16 support) then we + // have already verified that both IsA16 and IsG16 are true + if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } - // Don't compress addresses for G16 - const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); - packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, - ArgOffset + Intr->GradientStart, PackEndIdx, - Intr->NumGradients); + // Add gradients (packed or unpacked) + if (IsG16) { + // Pack the gradients + // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); + packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, + ArgOffset + Intr->GradientStart, + ArgOffset + Intr->CoordStart, Intr->NumGradients); + } else { + for (unsigned I = ArgOffset + Intr->GradientStart; + I < ArgOffset + Intr->CoordStart; I++) + VAddrs.push_back(Op.getOperand(I)); + } - if (!IsA16) { - // Add uncompressed address - for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) - VAddrs.push_back(Op.getOperand(I)); - } + // Add addresses (packed or unpacked) + if (IsA16) { + packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, + ArgOffset + Intr->CoordStart, VAddrEnd, + 0 /* No gradients */); } else { - for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++) + // Add uncompressed address + for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } @@ -6058,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. 
- bool UseNSA = - ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && + VAddrs.size() >= 3 && + VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); SDValue VAddr; if (!UseNSA) VAddr = getBuildDwordsVector(DAG, DL, VAddrs); @@ -6120,19 +6274,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - SDValue GLC; - SDValue SLC; - SDValue DLC; - if (BaseOpcode->Atomic) { - GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } else { - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } + unsigned CPol = cast<ConstantSDNode>( + Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); + if (BaseOpcode->Atomic) + CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + if (CPol & ~AMDGPU::CPol::ALL) + return Op; SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) @@ -6148,16 +6295,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); - if (IsGFX10Plus) - Ops.push_back(DLC); - Ops.push_back(GLC); - Ops.push_back(SLC); + Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); if (IsGFX10Plus) Ops.push_back(IsA16 ? True : False); - Ops.push_back(TFE); - Ops.push_back(LWE); + if (!Subtarget->hasGFX90AInsts()) { + Ops.push_back(TFE); //tfe + } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { + report_fatal_error("TFE is not supported on this GPU"); + } + Ops.push_back(LWE); // lwe if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? 
True : False); if (BaseOpcode->HasD16) @@ -6175,7 +6323,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op, : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + report_fatal_error( + "requested image instruction is not supported on this GPU"); + } + if (Opcode == -1 && + Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -6194,15 +6350,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); - } else if (!BaseOpcode->Store) { - return constructRetValue(DAG, NewNode, - OrigResultTypes, IsTexFail, - Subtarget->hasUnpackedD16VMem(), IsD16, - DMaskLanes, NumVDataDwords, DL, - *DAG.getContext()); } - - return SDValue(NewNode, 0); + if (BaseOpcode->Store) + return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, + OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, + DMaskLanes, NumVDataDwords, DL); } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, @@ -6448,11 +6602,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); - SDValue GLC; - SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); - if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, - IsGFX10Plus ? &DLC : nullptr)) + unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + if (CPol & ~AMDGPU::CPol::ALL) return Op; return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); @@ -6607,6 +6758,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_alignbit: return DAG.getNode(ISD::FSHR, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_perm: + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { Module *M = const_cast<Module *>(MF.getFunction().getParent()); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); @@ -6626,28 +6780,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } -// This function computes an appropriate offset to pass to -// MachineMemOperand::setOffset() based on the offset inputs to -// an intrinsic. If any of the offsets are non-contstant or -// if VIndex is non-zero then this function returns 0. Otherwise, -// it returns the sum of VOffset, SOffset, and Offset. -static unsigned getBufferOffsetForMMO(SDValue VOffset, - SDValue SOffset, - SDValue Offset, - SDValue VIndex = SDValue()) { - +/// Update \p MMO based on the offset inputs to an intrinsic. 
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset, + SDValue SOffset, SDValue Offset, + SDValue VIndex = SDValue()) { if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || - !isa<ConstantSDNode>(Offset)) - return 0; + !isa<ConstantSDNode>(Offset)) { + // The combined offset is not known to be constant, so we cannot represent + // it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; + } - if (VIndex) { - if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) - return 0; + if (VIndex && (!isa<ConstantSDNode>(VIndex) || + !cast<ConstantSDNode>(VIndex)->isNullValue())) { + // The strided index component of the address is not known to be zero, so we + // cannot represent it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; } - return cast<ConstantSDNode>(VOffset)->getSExtValue() + - cast<ConstantSDNode>(SOffset)->getSExtValue() + - cast<ConstantSDNode>(Offset)->getSExtValue(); + MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue()); } SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, @@ -6670,13 +6825,21 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, M->getMemOperand()); } +// Return a value to use for the idxen operand by examining the vindex operand. +static unsigned getIdxEn(SDValue VIndex) { + if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex)) + // No need to set idxen if vindex is known to be zero. + return VIndexC->getZExtValue() != 0; + return 1; +} + SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -6697,8 +6860,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, @@ -6811,9 +6973,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_load_format: { unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6824,11 +6984,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - - unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; @@ -6836,7 +6992,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6868,7 +7024,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]); return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: @@ -6888,8 +7044,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], - Ops[2])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { @@ -6900,9 +7055,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6983,9 +7136,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // vdata @@ -6997,14 +7148,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); unsigned Opcode = 0; switch (IntrID) { @@ -7042,7 +7191,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty()) { + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics not supported", @@ -7063,6 +7212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7119,9 +7276,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(5)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -7134,13 +7289,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7161,7 +7314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7182,33 +7335,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], - Ops[4])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_fadd: { - if (!Op.getValue(0).use_empty()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - MemSDNode *M = cast<MemSDNode>(Op); - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SDLoc DL(Op); MemSDNode *M = cast<MemSDNode>(Op); @@ -7224,6 +7355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(RayDir.getValueType() == MVT::v4f16 || RayDir.getValueType() == MVT::v4f32); + if (!Subtarget->hasGFX10_AEncoding()) { + emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); + return SDValue(); + } + bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; bool Is64 = NodePtr.getValueType() == MVT::i64; unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa @@ -7279,7 +7415,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } + case Intrinsic::amdgcn_global_atomic_fadd: + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + LLVM_FALLTHROUGH; + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + unsigned Opcode = 0; + switch (IntrID) { + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: { + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + } + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + } + default: + llvm_unreachable("unhandled atomic opcode"); + } + return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) return lowerImage(Op, ImageDimIntr, DAG, true); @@ -7448,9 +7632,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, // vdata @@ -7461,7 +7643,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(7), // offset DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7486,7 +7668,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(7), // format Op.getOperand(8), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idexen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7511,7 +7693,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idexen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7528,9 +7710,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, @@ -7542,15 +7722,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7597,7 +7775,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) @@ -7644,8 +7822,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. 
@@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                            SelectionDAG &DAG, SDValue *Offsets,
-                                            Align Alignment) const {
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        Align Alignment) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
@@ -7737,7 +7914,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
-      return SOffset + ImmOffset;
+      return;
     }
   }
   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7750,13 +7927,12 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
-      return 0;
+      return;
     }
   }
   Offsets[0] = CombinedOffset;
   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
   Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
-  return 0;
 }

 // Handle 8 bit and 16 bit buffer loads
@@ -8263,8 +8439,8 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {

 // Returns immediate value for setting the F32 denorm mode when using the
 // S_DENORM_MODE instruction.
-static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
-                                          const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+                                    const SDLoc &SL, const GCNSubtarget *ST) {
   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
   int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
                                 ? FP_DENORM_FLUSH_NONE
@@ -8794,18 +8970,20 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
 }

 // Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cmdmask_b32 to be deserialized.
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
 static bool isBoolSGPR(SDValue V) {
   if (V.getValueType() != MVT::i1)
     return false;
   switch (V.getOpcode()) {
-  default: break;
+  default:
+    break;
   case ISD::SETCC:
+  case AMDGPUISD::FP_CLASS:
+    return true;
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
-  case AMDGPUISD::FP_CLASS:
-    return true;
+    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
   }
   return false;
 }
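The isBoolSGPR hunk above tightens a correctness bug: AND/OR/XOR used to be accepted unconditionally, but a bitwise op on i1 only stays a lane mask if both of its inputs already are lane masks, so the check now recurses into the operands. A minimal sketch of the same predicate on a toy expression tree (types invented for illustration):

    #include <vector>

    enum class Kind { SetCC, FPClass, And, Or, Xor, Other };

    struct Node {
      Kind K;
      std::vector<const Node *> Ops;
    };

    // An i1 value is a lane mask if produced by a compare/class op, or by
    // a bitwise op whose *both* inputs are lane masks -- the refinement
    // the hunk above makes over the unconditional AND/OR/XOR acceptance.
    bool isBoolMask(const Node *N) {
      switch (N->K) {
      case Kind::SetCC:
      case Kind::FPClass:
        return true;
      case Kind::And:
      case Kind::Or:
      case Kind::Xor:
        return isBoolMask(N->Ops[0]) && isBoolMask(N->Ops[1]);
      default:
        return false;
      }
    }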
@@ -9206,63 +9384,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   return SDValue();
 }

-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FADD:
-  case ISD::FSUB:
-  case ISD::FMUL:
-  case ISD::FDIV:
-  case ISD::FREM:
-  case ISD::FMA:
-  case ISD::FMAD:
-  case ISD::FCANONICALIZE:
-  case ISD::FP_ROUND:
-  case ISD::UINT_TO_FP:
-  case ISD::SINT_TO_FP:
-  case ISD::FABS:
-    // Fabs is lowered to a bit operation, but it's an and which will clear the
-    // high bits anyway.
-  case ISD::FSQRT:
-  case ISD::FSIN:
-  case ISD::FCOS:
-  case ISD::FPOWI:
-  case ISD::FPOW:
-  case ISD::FLOG:
-  case ISD::FLOG2:
-  case ISD::FLOG10:
-  case ISD::FEXP:
-  case ISD::FEXP2:
-  case ISD::FCEIL:
-  case ISD::FTRUNC:
-  case ISD::FRINT:
-  case ISD::FNEARBYINT:
-  case ISD::FROUND:
-  case ISD::FFLOOR:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
-  case AMDGPUISD::FRACT:
-  case AMDGPUISD::CLAMP:
-  case AMDGPUISD::COS_HW:
-  case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
-  case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMAD_FTZ:
-  case AMDGPUISD::RCP:
-  case AMDGPUISD::RSQ:
-  case AMDGPUISD::RCP_IFLAG:
-  case AMDGPUISD::LDEXP:
-    return true;
-  default:
-    // fcopysign, select and others may be lowered to 32-bit bit operations
-    // which don't zero the high bits.
-    return false;
-  }
-}
-
 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   if (!Subtarget->has16BitInsts() ||
@@ -9277,15 +9398,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
   if (Src.getValueType() != MVT::i16)
     return SDValue();

-  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
-  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
-  if (Src.getOpcode() == ISD::BITCAST) {
-    SDValue BCSrc = Src.getOperand(0);
-    if (BCSrc.getValueType() == MVT::f16 &&
-        fp16SrcZerosHighBits(BCSrc.getOpcode()))
-      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
-  }
-
   return SDValue();
 }

@@ -9482,19 +9594,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     // Could be anything.
     return false;

-  case ISD::BITCAST: {
+  case ISD::BITCAST:
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+  case ISD::TRUNCATE: {
     // Hack round the mess we make when legalizing extract_vector_elt
-    SDValue Src = Op.getOperand(0);
-    if (Src.getValueType() == MVT::i16 &&
-        Src.getOpcode() == ISD::TRUNCATE) {
-      SDValue TruncSrc = Src.getOperand(0);
+    if (Op.getValueType() == MVT::i16) {
+      SDValue TruncSrc = Op.getOperand(0);
       if (TruncSrc.getValueType() == MVT::i32 &&
           TruncSrc.getOpcode() == ISD::BITCAST &&
           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
       }
     }
-    return false;
   }

   case ISD::INTRINSIC_WO_CHAIN: {
@@ -9527,6 +9638,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   llvm_unreachable("invalid operation");
 }

+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+                                       unsigned MaxDepth) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == AMDGPU::G_FCANONICALIZE)
+    return true;
+
+  if (Opcode == AMDGPU::G_FCONSTANT) {
+    auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+    if (F.isNaN() && F.isSignaling())
+      return false;
+    return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+  }
+
+  if (MaxDepth == 0)
+    return false;
+
+  switch (Opcode) {
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE: {
+    if (Subtarget->supportsMinMaxDenormModes() ||
+        denormalsEnabledForType(MRI.getType(Reg), MF))
+      return true;
+    for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+        return false;
+    }
+    return true;
+  }
+  default:
+    return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+           isKnownNeverSNaN(Reg, MRI);
+  }
+
+  llvm_unreachable("invalid operation");
+}
+
 // Constant fold canonicalize.
 SDValue SITargetLowering::getCanonicalConstantFP(
     SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
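The G_FCONSTANT case in the new GlobalISel isCanonicalized overload above encodes two facts: a signaling NaN is never canonical, and a denormal constant is canonical only while denormals are enabled for that type. A standalone restatement with raw IEEE-754 binary32 bit tests (assuming the usual MSB-is-quiet NaN convention, which AMDGPU and most hosts follow):

    #include <cstdint>
    #include <cstring>

    // Is this IEEE-754 single a signaling NaN?  Exponent all ones, quiet
    // bit (mantissa MSB) clear, non-zero payload.
    bool isSignalingNaN32(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      const bool ExpAllOnes = (Bits & 0x7f800000u) == 0x7f800000u;
      const bool QuietBit = (Bits & 0x00400000u) != 0;
      const bool Payload = (Bits & 0x003fffffu) != 0;
      return ExpAllOnes && !QuietBit && Payload;
    }

    // Mirror of the G_FCONSTANT case: sNaN is never canonical; a denormal
    // (zero exponent, non-zero mantissa) is canonical only when denormals
    // are enabled for the type.
    bool constantIsCanonical(float F, bool DenormalsEnabled) {
      if (isSignalingNaN32(F))
        return false;
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      const bool IsDenormal =
          (Bits & 0x7f800000u) == 0 && (Bits & 0x007fffffu) != 0;
      return !IsDenormal || DenormalsEnabled;
    }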
@@ -9694,15 +9844,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
   }

   // If there isn't a 16-bit med3 operation, convert to 32-bit.
-  MVT NVT = MVT::i32;
-  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  if (VT == MVT::i16) {
+    MVT NVT = MVT::i32;
+    unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+    SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+    SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+    SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

-  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
-  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
-  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+    SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+    return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+  }

-  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
-  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+  return SDValue();
 }

 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
@@ -10408,7 +10562,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SDLoc SL(N);

-  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
     return SDValue();

   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -10791,7 +10945,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   unsigned NewDmask = 0;
   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
-  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                   Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
   unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
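The performIntMed3ImmCombine hunk above now only widens when the type is really i16: extend both min/max operands and the immediate to 32 bits, form the med3 there, then truncate back. The same transformation in scalar form, using the standard min/max formulation of a three-way median:

    #include <algorithm>
    #include <cstdint>

    // med3(a, b, c): the middle of three values.
    int32_t med3(int32_t A, int32_t B, int32_t C) {
      return std::max(std::min(A, B), std::min(std::max(A, B), C));
    }

    // The widening in the hunk above, in scalar form: with no native
    // 16-bit med3, sign-extend to 32 bits (zero-extend for the unsigned
    // variant), take the med3 there, and truncate back.
    int16_t med3_i16_via_i32(int16_t A, int16_t B, int16_t C) {
      return static_cast<int16_t>(med3(int32_t(A), int32_t(B), int32_t(C)));
    }

This is value-preserving because extension is monotonic and the result of the 32-bit median is always one of the three extended inputs, so it fits back in 16 bits.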
@@ -11067,6 +11221,95 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
   return Node;
 }

+// Any MIMG instructions that use tfe or lwe require an initialization of the
+// result register that will be written in the case of a memory access failure.
+// The required code is also added to tie this init code to the result of the
+// img instruction.
+void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+  MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+  MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+  if (!TFE && !LWE) // intersect_ray
+    return;
+
+  unsigned TFEVal = TFE ? TFE->getImm() : 0;
+  unsigned LWEVal = LWE->getImm();
+  unsigned D16Val = D16 ? D16->getImm() : 0;
+
+  if (!TFEVal && !LWEVal)
+    return;
+
+  // At least one of TFE or LWE is non-zero.
+  // We have to insert a suitable initialization of the result value and
+  // tie this to the dest of the image instruction.
+
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  int DstIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+  // Calculate which dword we have to initialize to 0.
+  MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+  // Check that the dmask operand is found.
+  assert(MO_Dmask && "Expected dmask operand in instruction");
+
+  unsigned dmask = MO_Dmask->getImm();
+  // Determine the number of active lanes taking into account the
+  // Gather4 special case.
+  unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
+
+  bool Packed = !Subtarget->hasUnpackedD16VMem();
+
+  unsigned InitIdx =
+      D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+  // Abandon the attempt if the dst size isn't large enough
+  // - this is in fact an error but it is picked up elsewhere and
+  //   reported correctly.
+  uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+  if (DstSize < InitIdx)
+    return;
+
+  // Create a register for the initialization value.
+  Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+  unsigned NewDst = 0; // Final initialized value will be in here
+
+  // If the PRTStrictNull feature is enabled (the default) then initialize
+  // all the result registers to 0, otherwise just the error indication
+  // register (VGPRn+1).
+  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
+  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+  for (; SizeLeft; SizeLeft--, CurrIdx++) {
+    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+    // Initialize dword
+    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+        .addImm(0);
+    // Insert into the super-reg
+    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+        .addReg(PrevDst)
+        .addReg(SubReg)
+        .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
+
+    PrevDst = NewDst;
+  }
+
+  // Add as an implicit operand
+  MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
+
+  // Tie the just added implicit operand to the dst
+  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+}
+
 /// Assign the register class depending on the number of
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -11114,10 +11357,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
   if (NoRetAtomicOp != -1) {
     if (!Node->hasAnyUseOfValue(0)) {
-      int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                               AMDGPU::OpName::glc1);
-      if (Glc1Idx != -1)
-        MI.RemoveOperand(Glc1Idx);
+      int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                               AMDGPU::OpName::cpol);
+      if (CPolIdx != -1) {
+        MachineOperand &CPol = MI.getOperand(CPolIdx);
+        CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+      }
       MI.RemoveOperand(0);
       MI.setDesc(TII->get(NoRetAtomicOp));
       return;
@@ -11148,6 +11393,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     }
     return;
   }
+
+  if (TII->isMIMG(MI) && !MI.mayStore())
+    AddIMGInit(MI);
 }

 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
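The InitIdx computation in AddIMGInit above determines how many result dwords must be zero-initialized: the popcount of dmask gives the active lanes (gather4 always returns four), packed D16 stores two half-lanes per dword, and one extra dword holds the TFE/LWE error status. A standalone sketch of just that arithmetic (C++20 for std::popcount; the parameter names are mine):

    #include <bit>

    // Number of result dwords a TFE/LWE image load must initialize:
    // active lanes from the dmask (gather4 fixed at 4), halved and
    // rounded up when D16 results are packed, plus one status dword.
    unsigned numInitDwords(unsigned DMask, bool IsGather4, bool D16,
                           bool PackedD16) {
      unsigned ActiveLanes = IsGather4 ? 4 : unsigned(std::popcount(DMask));
      return (D16 && PackedD16) ? ((ActiveLanes + 1) >> 1) + 1
                                : ActiveLanes + 1;
    }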
@@ -11226,9 +11474,11 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
 //===----------------------------------------------------------------------===//

 std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                                StringRef Constraint,
                                                MVT VT) const {
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
   const TargetRegisterClass *RC = nullptr;
   if (Constraint.size() == 1) {
     const unsigned BitWidth = VT.getSizeInBits();
@@ -11257,7 +11507,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       RC = &AMDGPU::VGPR_32RegClass;
       break;
     default:
-      RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+      RC = TRI->getVGPRClassForBitWidth(BitWidth);
       if (!RC)
         return std::make_pair(0U, nullptr);
       break;
@@ -11271,7 +11521,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       RC = &AMDGPU::AGPR_32RegClass;
       break;
     default:
-      RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+      RC = TRI->getAGPRClassForBitWidth(BitWidth);
       if (!RC)
         return std::make_pair(0U, nullptr);
       break;
@@ -11444,6 +11694,47 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
   return false;
 }

+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+  switch (UnalignedClassID) {
+  case AMDGPU::VReg_64RegClassID:
+    return AMDGPU::VReg_64_Align2RegClassID;
+  case AMDGPU::VReg_96RegClassID:
+    return AMDGPU::VReg_96_Align2RegClassID;
+  case AMDGPU::VReg_128RegClassID:
+    return AMDGPU::VReg_128_Align2RegClassID;
+  case AMDGPU::VReg_160RegClassID:
+    return AMDGPU::VReg_160_Align2RegClassID;
+  case AMDGPU::VReg_192RegClassID:
+    return AMDGPU::VReg_192_Align2RegClassID;
+  case AMDGPU::VReg_224RegClassID:
+    return AMDGPU::VReg_224_Align2RegClassID;
+  case AMDGPU::VReg_256RegClassID:
+    return AMDGPU::VReg_256_Align2RegClassID;
+  case AMDGPU::VReg_512RegClassID:
+    return AMDGPU::VReg_512_Align2RegClassID;
+  case AMDGPU::VReg_1024RegClassID:
+    return AMDGPU::VReg_1024_Align2RegClassID;
+  case AMDGPU::AReg_64RegClassID:
+    return AMDGPU::AReg_64_Align2RegClassID;
+  case AMDGPU::AReg_96RegClassID:
+    return AMDGPU::AReg_96_Align2RegClassID;
+  case AMDGPU::AReg_128RegClassID:
+    return AMDGPU::AReg_128_Align2RegClassID;
+  case AMDGPU::AReg_160RegClassID:
+    return AMDGPU::AReg_160_Align2RegClassID;
+  case AMDGPU::AReg_192RegClassID:
+    return AMDGPU::AReg_192_Align2RegClassID;
+  case AMDGPU::AReg_256RegClassID:
+    return AMDGPU::AReg_256_Align2RegClassID;
+  case AMDGPU::AReg_512RegClassID:
+    return AMDGPU::AReg_512_Align2RegClassID;
+  case AMDGPU::AReg_1024RegClassID:
+    return AMDGPU::AReg_1024_Align2RegClassID;
+  default:
+    return -1;
+  }
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -11452,6 +11743,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();

   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
@@ -11474,7 +11766,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   Info->limitOccupancy(MF);

   if (ST.isWave32() && !MF.empty()) {
-    const SIInstrInfo *TII = ST.getInstrInfo();
     for (auto &MBB : MF) {
       for (auto &MI : MBB) {
         TII->fixImplicitOperands(MI);
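The getAlignedAGPRClassID mapping above, together with the remap loop in the next hunk, swaps each wide vector-register class for its _Align2 variant. As the class names suggest, the restriction being modeled is that on subtargets reporting needsAlignedVGPRs() a register tuple wider than one dword must start on an even-numbered register. A sketch of that legality rule in isolation (names and the exact rule are my reading of the _Align2 classes, not a quote from the ISA docs):

    #include <cassert>

    // On subtargets that need aligned VGPRs, a vector-register tuple
    // wider than 32 bits must begin at an even register index; single
    // 32-bit registers carry no extra restriction.
    bool tupleIsLegallyAligned(unsigned FirstReg, unsigned NumRegs,
                               bool NeedsAlignedVGPRs) {
      assert(NumRegs >= 1);
      if (!NeedsAlignedVGPRs || NumRegs == 1)
        return true;
      return (FirstReg % 2) == 0;
    }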
@@ -11482,13 +11773,30 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
     }
   }

+  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
+  // classes if required. Ideally the register class constraints would differ
+  // per-subtarget, but there's no easy way to achieve that right now. This is
+  // not a problem for VGPRs because the correctly aligned VGPR class is implied
+  // from using them as the register class for legal types.
+  if (ST.needsAlignedVGPRs()) {
+    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+      const Register Reg = Register::index2VirtReg(I);
+      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+      if (!RC)
+        continue;
+      int NewClassID = getAlignedAGPRClassID(RC->getID());
+      if (NewClassID != -1)
+        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+    }
+  }
+
   TargetLoweringBase::finalizeLowering(MF);

   // Allocate a VGPR for future SGPR Spill if
   // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
   // FIXME: We won't need this hack if we split SGPR allocation from VGPR
-  if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
-      !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+  if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
+      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
     Info->reserveVGPRforSGPRSpills(MF);
 }

@@ -11690,8 +11998,37 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
   case ISD::INTRINSIC_W_CHAIN:
     return AMDGPU::isIntrinsicSourceOfDivergence(
         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+  case AMDGPUISD::ATOMIC_CMP_SWAP:
+  case AMDGPUISD::ATOMIC_INC:
+  case AMDGPUISD::ATOMIC_DEC:
+  case AMDGPUISD::ATOMIC_LOAD_FMIN:
+  case AMDGPUISD::ATOMIC_LOAD_FMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
+  case AMDGPUISD::BUFFER_ATOMIC_ADD:
+  case AMDGPUISD::BUFFER_ATOMIC_SUB:
+  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_AND:
+  case AMDGPUISD::BUFFER_ATOMIC_OR:
+  case AMDGPUISD::BUFFER_ATOMIC_XOR:
+  case AMDGPUISD::BUFFER_ATOMIC_INC:
+  case AMDGPUISD::BUFFER_ATOMIC_DEC:
+  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
+  case AMDGPUISD::BUFFER_ATOMIC_CSUB:
+  case AMDGPUISD::BUFFER_ATOMIC_FADD:
+  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
+    // Target-specific read-modify-write atomics are sources of divergence.
+    return true;
+  default:
+    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
+      // Generic read-modify-write atomics are sources of divergence.
+      return A->readMem() && A->writeMem();
+    }
+    return false;
   }
-  return false;
 }

@@ -11707,6 +12044,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
   }
 }

+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+                                               MachineFunction &MF) const {
+  switch (Ty.getScalarSizeInBits()) {
+  case 32:
+    return hasFP32Denormals(MF);
+  case 64:
+  case 16:
+    return hasFP64FP16Denormals(MF);
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                     const SelectionDAG &DAG,
                                                     bool SNaN,
@@ -11745,24 +12095,57 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     if (Ty->isHalfTy())
       return AtomicExpansionKind::None;

-    if (!Ty->isFloatTy())
+    if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
       return AtomicExpansionKind::CmpXChg;

-    // TODO: Do have these for flat. Older targets also had them for buffers.
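The new denormalsEnabledForType(LLT) overload above dispatches purely on scalar width, because the hardware gives f32 its own denormal control while f16 and f64 share one mode. A condensed standalone restatement (the two boolean parameters stand in for the per-function mode queries hasFP32Denormals and hasFP64FP16Denormals):

    // f32 has an independent denormal control; f16 and f64 share one.
    // Unknown widths conservatively report denormals as disabled.
    bool denormalsEnabledFor(unsigned ScalarSizeInBits, bool HasFP32Denorms,
                             bool HasFP64FP16Denorms) {
      switch (ScalarSizeInBits) {
      case 32:
        return HasFP32Denorms;
      case 16:
      case 64:
        return HasFP64FP16Denorms;
      default:
        return false;
      }
    }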
     unsigned AS = RMW->getPointerAddressSpace();
-    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
-      if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+    if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+        Subtarget->hasAtomicFaddInsts()) {
+      // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+      // floating point atomic instructions. May generate more efficient code,
+      // but may not respect rounding and denormal modes, and may give
+      // incorrect results for certain memory destinations.
+      if (RMW->getFunction()
+              ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+              .getValueAsString() != "true")
+        return AtomicExpansionKind::CmpXChg;
+
+      if (Subtarget->hasGFX90AInsts()) {
+        if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
+          return AtomicExpansionKind::CmpXChg;
+
+        auto SSID = RMW->getSyncScopeID();
+        if (SSID == SyncScope::System ||
+            SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+          return AtomicExpansionKind::CmpXChg;
+
+        return AtomicExpansionKind::None;
+      }
+
+      if (AS == AMDGPUAS::FLAT_ADDRESS)
         return AtomicExpansionKind::CmpXChg;

-      return RMW->use_empty() ? AtomicExpansionKind::None :
-                                AtomicExpansionKind::CmpXChg;
+      return RMW->use_empty() ? AtomicExpansionKind::None
+                              : AtomicExpansionKind::CmpXChg;
     }

     // DS FP atomics do respect the denormal mode, but the rounding mode is
     // fixed to round-to-nearest-even.
-    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
-      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+    // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+    if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+      if (!Ty->isDoubleTy())
+        return AtomicExpansionKind::None;
+
+      return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+              RMW->getFunction()
+                  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                  .getValueAsString() == "true")
+                 ? AtomicExpansionKind::None
+                 : AtomicExpansionKind::CmpXChg;
+    }
+
+    return AtomicExpansionKind::CmpXChg;
   }
   default:
     break;
@@ -11872,10 +12255,11 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 }

-std::pair<int, MVT>
+std::pair<InstructionCost, MVT>
 SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
                                           Type *Ty) const {
-  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+  std::pair<InstructionCost, MVT> Cost =
+      TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
   auto Size = DL.getTypeSizeInBits(Ty);
   // Maximum load or store can handle 8 dwords for scalar and 4 for
   // vector ALU. Let's assume anything above 8 dwords is expensive
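The shouldExpandAtomicRMWInIR changes above gate the hardware fadd path behind the amdgpu-unsafe-fp-atomics function attribute. A condensed restatement of just the pre-gfx90a global/flat fadd branch, as a standalone decision function (the enum and parameter names are mine; ResultUsed corresponds to !RMW->use_empty()):

    enum class ExpansionKind { None, CmpXChg };

    // Pre-gfx90a float-add path from the hunk above: the unsafe-fp-atomics
    // attribute must be opted into, a flat address never gets the native
    // instruction, and the atomic's result must be unused.
    ExpansionKind expandGlobalFAdd(bool UnsafeFpAtomics, bool IsFlatAddress,
                                   bool ResultUsed) {
      if (!UnsafeFpAtomics)
        return ExpansionKind::CmpXChg;
      if (IsFlatAddress)
        return ExpansionKind::CmpXChg;
      return ResultUsed ? ExpansionKind::CmpXChg : ExpansionKind::None;
    }

Everything that falls through to CmpXChg is lowered as a compare-exchange loop, which honors the function's rounding and denormal modes at the cost of extra memory traffic.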