author    | Dimitry Andric <dim@FreeBSD.org> | 2021-06-13 19:31:46 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2021-07-31 18:56:55 +0000
commit    | af732203b8f7f006927528db5497f5cbc4c4742a (patch)
tree      | 596f112de3b76118552871dbb6114bb7e3e17f40 /contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent    | 83dea422ac8d4a8323e64203c2eadaa813768717 (diff)
Merge llvm-project 12.0.1 release and follow-up fixes
Merge llvm-project main llvmorg-12-init-17869-g8e464dd76bef
This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12-init-17869-g8e464dd76bef, the last commit before the
upstream release/12.x branch was created.
PR: 255570
(cherry picked from commit e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)
Merge llvm-project 12.0.0 release
This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.0-0-gd28af7c654d8, a.k.a. 12.0.0 release.
PR: 255570
(cherry picked from commit d409305fa3838fb39b38c26fc085fb729b8766d5)
Disable strict-fp for powerpcspe, as it does not work properly yet
Merge commit 5c18d1136665 from llvm git (by Qiu Chaofan)
[SPE] Disable strict-fp for SPE by default
As discussed in PR50385, strict-fp on PowerPC SPE has not been
handled well. This patch disables it by default for SPE.
Reviewed By: nemanjai, vit9696, jhibbits
Differential Revision: https://reviews.llvm.org/D103235
PR: 255570
(cherry picked from commit 715df83abc049b23d9acddc81f2480bd4c056d64)
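For a concrete sense of what "strict-fp" covers, here is a minimal C++ sketch (illustrative only, not taken from the patch): code that changes the rounding mode at run time relies on the strict floating-point lowering that this change turns off by default for SPE.

```cpp
// strict_fp_sketch.cpp -- illustrative only; file name and values are made up.
// Honoring the fesetround() below across the addition is the kind of strict-FP
// behavior whose backend lowering is disabled by default for SPE here.
#include <cfenv>
#include <cstdio>

int main() {
    std::fesetround(FE_UPWARD);          // change the rounding mode at run time
    volatile double a = 1.0, b = 1e-20;  // volatile keeps the add out of constant folding
    double r = a + b;                    // under strict FP (e.g. -frounding-math) this must round upward
    std::printf("%.20g\n", r);
    return 0;
}
```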
Apply upstream libc++ fix to allow building with devel/xxx-xtoolchain-gcc
Merge commit 52e9d80d5db2 from llvm git (by Jason Liu):
[libc++] add `inline` for __open's definition in ifstream and ofstream
Summary:
When building with gcc on AIX, it seems that gcc does not like the
`always_inline` without the `inline` keyword.
So add the `inline` keyword for __open in ifstream and ofstream.
That also makes it consistent with __open in basic_filebuf
(where `inline` was already added earlier for the gcc build).
Differential Revision: https://reviews.llvm.org/D99422
PR: 255570
(cherry picked from commit d099db25464b826c5724cf2fb5b22292bbe15f6e)
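To illustrate the underlying C++ rule (with a hypothetical header, not the actual libc++ sources): a member function defined outside its class in a header needs `inline` for the usual definition merging, and `always_inline` alone does not provide that.

```cpp
// stream_open.h -- hypothetical header-only sketch, not the actual libc++ code.
#pragma once

struct basic_stream {
    bool __open(const char* __s, int __mode);
    int __fd_ = -1;
};

// A member function defined outside its class in a header is NOT implicitly
// inline. With only __attribute__((__always_inline__)), GCC still gives this
// definition external linkage, so two translation units including the header
// collide at link time; adding `inline` merges the definitions as usual.
__attribute__((__always_inline__)) inline
bool basic_stream::__open(const char* __s, int __mode) {
    __fd_ = (__s != nullptr && __mode != 0) ? 3 : -1;  // placeholder body
    return __fd_ >= 0;
}
```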
Undefine HAVE_(DE)REGISTER_FRAME in llvm's config.h on arm
Otherwise, the lli tool (enabled by WITH_CLANG_EXTRAS) won't link on arm,
stating that __register_frame is undefined. This function is normally
provided by libunwind, but explicitly not for the ARM Exception ABI.
Reported by: oh
PR: 255570
(cherry picked from commit f336b45e943c7f9a90ffcea1a6c4c7039e54c73c)
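A sketch of the intent of the change follows; the exact guard and its location in FreeBSD's generated llvm config.h are assumptions, not copied from the commit.

```cpp
/* Sketch only: advertise __register_frame/__deregister_frame only where
 * libunwind actually provides them, i.e. not for the ARM Exception ABI,
 * so that lli links again on arm. */
#if !defined(__arm__)
#define HAVE_REGISTER_FRAME 1
#define HAVE_DEREGISTER_FRAME 1
#endif
```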
Merge llvm-project 12.0.1 rc2
This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-rc2-0-ge7dac564cd0e, a.k.a. 12.0.1 rc2.
PR: 255570
(cherry picked from commit 23408297fbf3089f0388a8873b02fa75ab3f5bb9)
Revert libunwind change to fix backtrace segfault on aarch64
Revert commit 22b615a96593 from llvm git (by Daniel Kiss):
[libunwind] Support for leaf function unwinding.
Unwinding a leaf function is useful in cases where the backtrace finds a
leaf function, for example when it caused a signal.
This patch also adds support for DW_CFA_undefined, because it marks
the end of the frames.
Ryan Prichard provided code for the tests.
Reviewed By: #libunwind, mstorsjo
Differential Revision: https://reviews.llvm.org/D83573
Reland, with the test limited to the x86_64-linux target.
Bisection has shown that this particular upstream commit causes programs
using backtrace(3) on aarch64 to segfault. This affects the lang/rust
port, for instance. Until the problem can be fixed upstream, revert
the commit for now.
Reported by: mikael
PR: 256864
(cherry picked from commit 5866c369e4fd917c0d456f0f10b92ee354b82279)
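A minimal sketch of the kind of program that hit the segfault is shown below; the file name and build line are illustrative, and the real report came through lang/rust rather than this exact code.

```cpp
// bt_repro.cpp -- minimal shape of an affected program (illustrative only).
// On FreeBSD, build with: c++ bt_repro.cpp -lexecinfo
#include <execinfo.h>
#include <cstdio>
#include <cstdlib>

int main() {
    void* frames[64];
    int n = backtrace(frames, 64);                 // walks the stack via the unwinder
    char** names = backtrace_symbols(frames, n);   // malloc'ed array of strings
    for (int i = 0; i < n; ++i)
        std::printf("%s\n", names != nullptr ? names[i] : "?");
    std::free(names);
    return 0;
}
```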
Merge llvm-project 12.0.1 release
This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-0-gfed41342a82f, a.k.a. 12.0.1 release.
PR: 255570
(cherry picked from commit 4652422eb477731f284b1345afeefef7f269da50)
compiler-rt: build out-of-line LSE atomics helpers for aarch64
Both clang >= 12 and gcc >= 10.1 now default to -moutline-atomics for
aarch64. That default requires a set of helper functions in libcompiler_rt.a
to avoid link errors like "undefined symbol: __aarch64_ldadd8_acq_rel".
(Note: of course you can use -mno-outline-atomics as a workaround too,
but this would negate the potential performance benefit of the faster
LSE instructions.)
Bump __FreeBSD_version so ports maintainers can easily detect this.
PR: 257392
(cherry picked from commit cc55ee8009a550810d38777fd6ace9abf3a2f6b4)
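The following sketch shows how ordinary C++ code ends up needing these helpers; the symbol named in the comment is the one quoted above, while everything else (file name, wrapper function) is illustrative.

```cpp
// atomic_counter.cpp -- any 8-byte atomic read-modify-write is enough to pull
// in an out-of-line helper once the compiler defaults to -moutline-atomics.
#include <atomic>
#include <cstdint>

std::atomic<std::uint64_t> counter{0};

std::uint64_t bump() {
    // With -moutline-atomics this becomes a call to __aarch64_ldadd8_acq_rel
    // (resolved from libcompiler_rt.a) rather than an inline LL/SC loop, so the
    // runtime can select LSE instructions when the CPU supports them.
    return counter.fetch_add(1, std::memory_order_acq_rel);
}

int main() { return static_cast<int>(bump() & 0xff); }
```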
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2013 |
1 file changed, 1097 insertions, 916 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d035aa8f72bd..839437b5e3f8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13,72 +13,21 @@ #include "SIISelLowering.h" #include "AMDGPU.h" -#include "AMDGPUSubtarget.h" +#include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/Analysis.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/DAGCombine.h" -#include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/TargetCallingConv.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Instruction.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CodeGen.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" -#include "llvm/Support/MachineValueType.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetOptions.h" -#include <cassert> -#include <cmath> -#include <cstdint> -#include <iterator> -#include <tuple> -#include <utility> -#include <vector> using namespace llvm; @@ -449,6 +398,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FLOG, MVT::f16, Custom); setOperationAction(ISD::FEXP, MVT::f16, Custom); setOperationAction(ISD::FLOG10, MVT::f16, Custom); @@ -486,6 +436,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->hasBFE()) setHasExtractBitsInsn(true); + // Clamp modifier on add/sub + if (Subtarget->hasIntClamp()) { + setOperationAction(ISD::UADDSAT, MVT::i32, Legal); + setOperationAction(ISD::USUBSAT, 
MVT::i32, Legal); + } + + if (Subtarget->hasAddNoCarry()) { + setOperationAction(ISD::SADDSAT, MVT::i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::i32, Legal); + setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); + } + setOperationAction(ISD::FMINNUM, MVT::f32, Custom); setOperationAction(ISD::FMAXNUM, MVT::f32, Custom); setOperationAction(ISD::FMINNUM, MVT::f64, Custom); @@ -531,13 +494,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote); AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32); - setOperationAction(ISD::ROTR, MVT::i16, Promote); - setOperationAction(ISD::ROTL, MVT::i16, Promote); + setOperationAction(ISD::ROTR, MVT::i16, Expand); + setOperationAction(ISD::ROTL, MVT::i16, Expand); setOperationAction(ISD::SDIV, MVT::i16, Promote); setOperationAction(ISD::UDIV, MVT::i16, Promote); setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); + setOperationAction(ISD::UADDSAT, MVT::i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::i16, Legal); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); @@ -702,6 +667,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMAX, MVT::v2i16, Legal); setOperationAction(ISD::UMAX, MVT::v2i16, Legal); + setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal); + setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal); + setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal); + setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal); + setOperationAction(ISD::FADD, MVT::v2f16, Legal); setOperationAction(ISD::FMUL, MVT::v2f16, Legal); setOperationAction(ISD::FMA, MVT::v2f16, Legal); @@ -729,6 +699,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v4i16, Custom); setOperationAction(ISD::UMAX, MVT::v4i16, Custom); + setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom); + setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom); + setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom); + setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); setOperationAction(ISD::FMA, MVT::v4f16, Custom); @@ -779,6 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); @@ -790,6 +767,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); @@ -844,6 +823,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN); 
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); // FIXME: In other contexts we pretend this is a per-function property. setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); @@ -888,15 +869,18 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32) - return ScalarVT.getSimpleVT(); + if (Size == 16) { + if (Subtarget->has16BitInsts()) + return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + return VT.isInteger() ? MVT::i32 : MVT::f32; + } - if (Size > 32) - return MVT::i32; + if (Size < 16) + return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; + return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32; + } - if (Size == 16 && Subtarget->has16BitInsts()) - return VT.isInteger() ? MVT::v2i16 : MVT::v2f16; - } else if (VT.getSizeInBits() > 32) + if (VT.getSizeInBits() > 32) return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); @@ -913,14 +897,15 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); - if (Size == 32) + // FIXME: Should probably promote 8-bit vectors to i16. + if (Size == 16 && Subtarget->has16BitInsts()) + return (NumElts + 1) / 2; + + if (Size <= 32) return NumElts; if (Size > 32) return NumElts * ((Size + 31) / 32); - - if (Size == 16 && Subtarget->has16BitInsts()) - return (NumElts + 1) / 2; } else if (VT.getSizeInBits() > 32) return (VT.getSizeInBits() + 31) / 32; @@ -935,6 +920,16 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); + // FIXME: We should fix the ABI to be the same on targets without 16-bit + // support, but unless we can properly handle 3-vectors, it will be still be + // inconsistent. + if (Size == 16 && Subtarget->has16BitInsts()) { + RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16; + IntermediateVT = RegisterVT; + NumIntermediates = (NumElts + 1) / 2; + return NumIntermediates; + } + if (Size == 32) { RegisterVT = ScalarVT.getSimpleVT(); IntermediateVT = RegisterVT; @@ -942,20 +937,26 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size > 32) { + if (Size < 16 && Subtarget->has16BitInsts()) { + // FIXME: Should probably form v2i16 pieces + RegisterVT = MVT::i16; + IntermediateVT = ScalarVT; + NumIntermediates = NumElts; + return NumIntermediates; + } + + + if (Size != 16 && Size <= 32) { RegisterVT = MVT::i32; - IntermediateVT = RegisterVT; - NumIntermediates = NumElts * ((Size + 31) / 32); + IntermediateVT = ScalarVT; + NumIntermediates = NumElts; return NumIntermediates; } - // FIXME: We should fix the ABI to be the same on targets without 16-bit - // support, but unless we can properly handle 3-vectors, it will be still be - // inconsistent. - if (Size == 16 && Subtarget->has16BitInsts()) { - RegisterVT = VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; + if (Size > 32) { + RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = (NumElts + 1) / 2; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } } @@ -1007,14 +1008,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); if (RsrcIntr->IsImage) { - Info.ptrVal = MFI->getImagePSV( - *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), - CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.ptrVal = + MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); Info.align.reset(); } else { - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), - CI.getArgOperand(RsrcIntr->RsrcArg)); + Info.ptrVal = + MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); } Info.flags = MachineMemOperand::MODereferenceable; @@ -1056,8 +1055,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOStore; } else { // Atomic - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); + Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : + ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable; @@ -1091,11 +1091,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_buffer_atomic_fadd: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - Info.opc = ISD::INTRINSIC_VOID; + Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); - Info.ptrVal = MFI->getBufferPSV( - *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), - CI.getArgOperand(1)); + Info.ptrVal = + MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; @@ -1105,16 +1104,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getOperand(0)->getType() - ->getPointerElementType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - - return true; - } case Intrinsic::amdgcn_ds_append: case Intrinsic::amdgcn_ds_consume: { Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1136,10 +1125,31 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile; + return true; + } + case Intrinsic::amdgcn_global_atomic_fadd: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable | MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT? 
+ Info.ptrVal = + MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MODereferenceable; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1175,9 +1185,13 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_ds_append: + case Intrinsic::amdgcn_ds_consume: case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { + case Intrinsic::amdgcn_ds_fmax: + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); Ops.push_back(Ptr); @@ -1234,7 +1248,7 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { // assume those use MUBUF instructions. Scratch loads / stores are currently // implemented as mubuf instructions with offen bit set, so slightly // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) + if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs)) return false; // FIXME: Since we can split immediate into soffset and immediate offset, @@ -1355,37 +1369,77 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, } bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( - unsigned Size, unsigned AddrSpace, unsigned Align, + unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. - bool AlignedBy4 = (Align % 4 == 0); + // Check if alignment requirements for ds_read/write instructions are + // disabled. + if (Subtarget->hasUnalignedDSAccessEnabled() && + !Subtarget->hasLDSMisalignedBug()) { + if (IsFast) + *IsFast = Alignment != Align(2); + return true; + } + + if (Size == 64) { + // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte + // aligned, 8 byte access in a single operation using ds_read2/write2_b32 + // with adjacent offsets. + bool AlignedBy4 = Alignment >= Align(4); + if (IsFast) + *IsFast = AlignedBy4; + + return AlignedBy4; + } + if (Size == 96) { + // ds_read/write_b96 require 16-byte alignment on gfx8 and older. + bool Aligned = Alignment >= Align(16); + if (IsFast) + *IsFast = Aligned; + + return Aligned; + } + if (Size == 128) { + // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we + // can do a 8 byte aligned, 16 byte access in a single operation using + // ds_read2/write2_b64. + bool Aligned = Alignment >= Align(8); + if (IsFast) + *IsFast = Aligned; + + return Aligned; + } + } + + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { + bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || + Subtarget->enableFlatScratch() || + Subtarget->hasUnalignedScratchAccess(); } // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. 
- if (!Subtarget->hasUnalignedScratchAccess() && - (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || - AddrSpace == AMDGPUAS::FLAT_ADDRESS)) { - bool AlignedBy4 = Align >= 4; + if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && + !Subtarget->hasUnalignedScratchAccess()) { + bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; return AlignedBy4; } - if (Subtarget->hasUnalignedBufferAccess()) { + if (Subtarget->hasUnalignedBufferAccessEnabled() && + !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS)) { // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { @@ -1393,7 +1447,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? - Align >= 4 : Align != 2; + Alignment >= Align(4) : Alignment != Align(2); } return true; @@ -1409,12 +1463,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( if (IsFast) *IsFast = true; - return Size >= 32 && Align >= 4; + return Size >= 32 && Alignment >= Align(4); } bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *IsFast) const { + EVT VT, unsigned AddrSpace, unsigned Alignment, + MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1428,7 +1482,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Align, Flags, IsFast); + Align(Alignment), Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1449,11 +1503,6 @@ EVT SITargetLowering::getOptimalMemOpType( return MVT::Other; } -bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, - unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); -} - bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); @@ -1461,6 +1510,11 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const { return I && I->getMetadata("amdgpu.noclobber"); } +bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS || + AS == AMDGPUAS::PRIVATE_ADDRESS; +} + bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { // Flat -> private/local is a simple truncate. 
@@ -1468,7 +1522,9 @@ bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS, if (SrcAS == AMDGPUAS::FLAT_ADDRESS) return true; - return isNoopAddrSpaceCast(SrcAS, DestAS); + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + return TM.isNoopAddrSpaceCast(SrcAS, DestAS); } bool SITargetLowering::isMemOpUniform(const SDNode *N) const { @@ -1537,7 +1593,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getObjectPtrOffset(SL, BasePtr, Offset); + return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, @@ -1597,9 +1653,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( // TODO: If we passed in the base kernel offset we could have a better // alignment than 4, but we don't really need it. SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset); - SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4, + SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4), MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32); SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt); @@ -1682,12 +1738,11 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); } -static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, - CallingConv::ID CallConv, - ArrayRef<ISD::InputArg> Ins, - BitVector &Skipped, - FunctionType *FType, - SIMachineFunctionInfo *Info) { +static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, + CallingConv::ID CallConv, + ArrayRef<ISD::InputArg> Ins, BitVector &Skipped, + FunctionType *FType, + SIMachineFunctionInfo *Info) { for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) { const ISD::InputArg *Arg = &Ins[I]; @@ -1895,26 +1950,26 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { if (Info.hasImplicitBufferPtr()) { - unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); + Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
if (Info.hasPrivateSegmentBuffer()) { - unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); + Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } if (Info.hasDispatchPtr()) { - unsigned DispatchPtrReg = Info.addDispatchPtr(TRI); + Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } if (Info.hasQueuePtr()) { - unsigned QueuePtrReg = Info.addQueuePtr(TRI); + Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } @@ -1929,13 +1984,13 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasDispatchID()) { - unsigned DispatchIDReg = Info.addDispatchID(TRI); + Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit()) { - unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI); + if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { + Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } @@ -1951,25 +2006,25 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, CallingConv::ID CallConv, bool IsShader) const { if (Info.hasWorkGroupIDX()) { - unsigned Reg = Info.addWorkGroupIDX(); + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { - unsigned Reg = Info.addWorkGroupIDY(); + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { - unsigned Reg = Info.addWorkGroupIDZ(); + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupInfo()) { - unsigned Reg = Info.addWorkGroupInfo(); + Register Reg = Info.addWorkGroupInfo(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } @@ -2020,26 +2075,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // the scratch registers to pass in. bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); - if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { - // If we have stack objects, we unquestionably need the private buffer - // resource. For the Code Object V2 ABI, this will be the first 4 user - // SGPR inputs. We can reserve those and use them directly. + if (!ST.enableFlatScratch()) { + if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the Code Object V2 ABI, this will be the first 4 user + // SGPR inputs. We can reserve those and use them directly. - Register PrivateSegmentBufferReg = - Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - Info.setScratchRSrcReg(PrivateSegmentBufferReg); - } else { - unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); - // We tentatively reserve the last registers (skipping the last registers - // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, - // we'll replace these with the ones immediately after those which were - // really allocated. 
In the prologue copies will be inserted from the - // argument to these reserved registers. + Register PrivateSegmentBufferReg = + Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + Info.setScratchRSrcReg(PrivateSegmentBufferReg); + } else { + unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); + // We tentatively reserve the last registers (skipping the last registers + // which may contain VCC, FLAT_SCR, and XNACK). After register allocation, + // we'll replace these with the ones immediately after those which were + // really allocated. In the prologue copies will be inserted from the + // argument to these reserved registers. - // Without HSA, relocations are used for the scratch pointer and the - // buffer resource setup is always inserted in the prologue. Scratch wave - // offset is still in an input SGPR. - Info.setScratchRSrcReg(ReservedBufferReg); + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. + Info.setScratchRSrcReg(ReservedBufferReg); + } } MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -2139,7 +2196,7 @@ SDValue SITargetLowering::LowerFormalArguments( FunctionType *FType = MF.getFunction().getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { DiagnosticInfoUnsupported NoGraphicsHSA( Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); @@ -2152,12 +2209,21 @@ SDValue SITargetLowering::LowerFormalArguments( CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); - bool IsShader = AMDGPU::isShader(CallConv); + bool IsGraphics = AMDGPU::isGraphics(CallConv); bool IsKernel = AMDGPU::isKernel(CallConv); bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); - if (IsShader) { - processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); + if (IsGraphics) { + assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && + (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); + } + + if (CallConv == CallingConv::AMDGPU_PS) { + processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info); // At least one interpolation mode must be enabled or else the GPU will // hang. @@ -2172,39 +2238,28 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. - if (CallConv == CallingConv::AMDGPU_PS) { - if ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11))) { - CCInfo.AllocateReg(AMDGPU::VGPR0); - CCInfo.AllocateReg(AMDGPU::VGPR1); - Info->markPSInputAllocated(0); - Info->markPSInputEnabled(0); - } - if (Subtarget->isAmdPalOS()) { - // For isAmdPalOS, the user does not enable some bits after compilation - // based on run-time states; the register values being generated here are - // the final ones set in hardware. Therefore we need to apply the - // workaround to PSInputAddr and PSInputEnable together. 
(The case where - // a bit is set in PSInputAddr but not PSInputEnable is where the - // frontend set up an input arg for a particular interpolation mode, but - // nothing uses that input arg. Really we should have an earlier pass - // that removes such an arg.) - unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); - if ((PsInputBits & 0x7F) == 0 || - ((PsInputBits & 0xF) == 0 && - (PsInputBits >> 11 & 1))) - Info->markPSInputEnabled( - countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); - } + if ((Info->getPSInputAddr() & 0x7F) == 0 || + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { + CCInfo.AllocateReg(AMDGPU::VGPR0); + CCInfo.AllocateReg(AMDGPU::VGPR1); + Info->markPSInputAllocated(0); + Info->markPSInputEnabled(0); + } + if (Subtarget->isAmdPalOS()) { + // For isAmdPalOS, the user does not enable some bits after compilation + // based on run-time states; the register values being generated here are + // the final ones set in hardware. Therefore we need to apply the + // workaround to PSInputAddr and PSInputEnable together. (The case where + // a bit is set in PSInputAddr but not PSInputEnable is where the + // frontend set up an input arg for a particular interpolation mode, but + // nothing uses that input arg. Really we should have an earlier pass + // that removes such an arg.) + unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); + if ((PsInputBits & 0x7F) == 0 || + ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1))) + Info->markPSInputEnabled( + countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined)); } - - assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && - !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && - !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); } else if (IsKernel) { assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); } else { @@ -2253,9 +2308,23 @@ SDValue SITargetLowering::LowerFormalArguments( const uint64_t Offset = VA.getLocMemOffset(); Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); - SDValue Arg = - lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment, - Ins[i].Flags.isSExt(), &Ins[i]); + if (Arg.Flags.isByRef()) { + SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset); + + const GCNTargetMachine &TM = + static_cast<const GCNTargetMachine &>(getTargetMachine()); + if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace())) { + Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS, + Arg.Flags.getPointerAddrSpace()); + } + + InVals.push_back(Ptr); + continue; + } + + SDValue Arg = lowerKernargMemParameter( + DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = @@ -2337,7 +2406,7 @@ SDValue SITargetLowering::LowerFormalArguments( // Start adding system SGPRs. 
if (IsEntryFunc) { - allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); + allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); @@ -2820,7 +2889,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, report_fatal_error("unsupported libcall legalization"); if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - !CLI.CB->getCalledFunction()) { + !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); } @@ -2830,11 +2899,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported required tail call to function "); } - if (AMDGPU::isShader(MF.getFunction().getCallingConv())) { - // Note the issue is with the CC of the calling function, not of the call + if (AMDGPU::isShader(CallConv)) { + // Note the issue is with the CC of the called function, not of the call // itself. return lowerUnhandledCall(CLI, InVals, - "unsupported call from graphics shader of function "); + "unsupported call to a shader function "); + } + + if (AMDGPU::isShader(MF.getFunction().getCallingConv()) && + CallConv != CallingConv::AMDGPU_Gfx) { + // Only allow calls with specific calling conventions. + return lowerUnhandledCall(CLI, InVals, + "unsupported calling convention for call from " + "graphics shader of function "); } if (IsTailCall) { @@ -2865,7 +2942,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + if (AMDGPUTargetMachine::EnableFixedFunctionABI && + CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -2894,14 +2972,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!IsSibCall) { Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); - SmallVector<SDValue, 4> CopyFromChains; + if (!Subtarget->enableFlatScratch()) { + SmallVector<SDValue, 4> CopyFromChains; - // In the HSA case, this should be an identity copy. - SDValue ScratchRSrcReg - = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); - RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); - CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); - Chain = DAG.getTokenFactor(DL, CopyFromChains); + // In the HSA case, this should be an identity copy. + SDValue ScratchRSrcReg + = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); + RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); + Chain = DAG.getTokenFactor(DL, CopyFromChains); + } } MVT PtrVT = MVT::i32; @@ -2992,14 +3072,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, - Alignment ? Alignment->value() : 0); + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment); MemOpChains.push_back(Store); } } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI) { + if (!AMDGPUTargetMachine::EnableFixedFunctionABI && + CallConv != CallingConv::AMDGPU_Gfx) { // Copy special input registers after user input arguments. 
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3223,29 +3304,11 @@ Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, // If kill is not the last instruction, split the block so kill is always a // proper terminator. -MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, - MachineBasicBlock *BB) const { +MachineBasicBlock * +SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - - MachineBasicBlock::iterator SplitPoint(&MI); - ++SplitPoint; - - if (SplitPoint == BB->end()) { - // Don't bother with a new block. - MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); - return BB; - } - - MachineFunction *MF = BB->getParent(); - MachineBasicBlock *SplitBB - = MF->CreateMachineBasicBlock(BB->getBasicBlock()); - - MF->insert(++MachineFunction::iterator(BB), SplitBB); - SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); - - SplitBB->transferSuccessorsAndUpdatePHIs(BB); - BB->addSuccessor(SplitBB); - MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); return SplitBB; } @@ -3357,20 +3420,14 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, // will only do one iteration. In the worst case, this will loop 64 times. // // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value. -static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( - const SIInstrInfo *TII, - MachineRegisterInfo &MRI, - MachineBasicBlock &OrigBB, - MachineBasicBlock &LoopBB, - const DebugLoc &DL, - const MachineOperand &IdxReg, - unsigned InitReg, - unsigned ResultReg, - unsigned PhiReg, - unsigned InitSaveExecReg, - int Offset, - bool UseGPRIdxMode, - bool IsIndirectSrc) { +static MachineBasicBlock::iterator +emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, + MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, + const DebugLoc &DL, const MachineOperand &Idx, + unsigned InitReg, unsigned ResultReg, unsigned PhiReg, + unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, + Register &SGPRIdxReg) { + MachineFunction *MF = OrigBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -3396,12 +3453,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( // Read the next variant <- also loop target. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) - .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef())); // Compare the just read M0 value to all possible Idx values. BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) - .addReg(CurrentIdxReg) - .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg()); + .addReg(CurrentIdxReg) + .addReg(Idx.getReg(), 0, Idx.getSubReg()); // Update EXEC, save the original EXEC value to VCC. BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? 
AMDGPU::S_AND_SAVEEXEC_B32 @@ -3412,22 +3469,14 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( MRI.setSimpleHint(NewExec, CondReg); if (UseGPRIdxMode) { - unsigned IdxReg; if (Offset == 0) { - IdxReg = CurrentIdxReg; + SGPRIdxReg = CurrentIdxReg; } else { - IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg) - .addReg(CurrentIdxReg, RegState::Kill) - .addImm(Offset); + SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg) + .addReg(CurrentIdxReg, RegState::Kill) + .addImm(Offset); } - unsigned IdxMode = IsIndirectSrc ? - AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE; - MachineInstr *SetOn = - BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(IdxReg, RegState::Kill) - .addImm(IdxMode); - SetOn->getOperand(3).setIsUndef(); } else { // Move index from VCC into M0 if (Offset == 0) { @@ -3463,14 +3512,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( // per-workitem, so is kept alive for the whole loop so we end up not re-using a // subregister from it, using 1 more VGPR than necessary. This was saved when // this was expanded after register allocation. -static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, - MachineBasicBlock &MBB, - MachineInstr &MI, - unsigned InitResultReg, - unsigned PhiReg, - int Offset, - bool UseGPRIdxMode, - bool IsIndirectSrc) { +static MachineBasicBlock::iterator +loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, + unsigned InitResultReg, unsigned PhiReg, int Offset, + bool UseGPRIdxMode, Register &SGPRIdxReg) { MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -3499,7 +3544,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, - Offset, UseGPRIdxMode, IsIndirectSrc); + Offset, UseGPRIdxMode, SGPRIdxReg); + MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); MachineFunction::iterator MBBI(LoopBB); ++MBBI; @@ -3530,64 +3576,45 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); } -// Return true if the index is an SGPR and was set. -static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, - MachineRegisterInfo &MRI, - MachineInstr &MI, - int Offset, - bool UseGPRIdxMode, - bool IsIndirectSrc) { +static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, + MachineRegisterInfo &MRI, MachineInstr &MI, + int Offset) { MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); assert(Idx->getReg() != AMDGPU::NoRegister); - if (!TII->getRegisterInfo().isSGPRClass(IdxRC)) - return false; - - if (UseGPRIdxMode) { - unsigned IdxMode = IsIndirectSrc ? 
- AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE; - if (Offset == 0) { - MachineInstr *SetOn = - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .add(*Idx) - .addImm(IdxMode); + if (Offset == 0) { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx); + } else { + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .add(*Idx) + .addImm(Offset); + } +} - SetOn->getOperand(3).setIsUndef(); - } else { - Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) - .add(*Idx) - .addImm(Offset); - MachineInstr *SetOn = - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON)) - .addReg(Tmp, RegState::Kill) - .addImm(IdxMode); +static Register getIndirectSGPRIdx(const SIInstrInfo *TII, + MachineRegisterInfo &MRI, MachineInstr &MI, + int Offset) { + MachineBasicBlock *MBB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + MachineBasicBlock::iterator I(&MI); - SetOn->getOperand(3).setIsUndef(); - } + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - return true; - } + if (Offset == 0) + return Idx->getReg(); - if (Offset == 0) { - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .add(*Idx); - } else { - BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) .add(*Idx) .addImm(Offset); - } - - return true; + return Tmp; } -// Control flow needs to be inserted if indexing with a VGPR. static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST) { @@ -3597,10 +3624,12 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineRegisterInfo &MRI = MF->getRegInfo(); Register Dst = MI.getOperand(0).getReg(); + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); unsigned SubReg; std::tie(SubReg, Offset) @@ -3608,7 +3637,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, const bool UseGPRIdxMode = ST.useVGPRIndexMode(); - if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) { + // Check for a SGPR index. + if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); @@ -3616,14 +3646,19 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, // TODO: Look at the uses to avoid the copy. This may require rescheduling // to avoid interfering with other uses, so probably requires a new // optimization pass. 
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); + + const MCInstrDesc &GPRIDXDesc = + TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); + BuildMI(MBB, I, DL, GPRIDXDesc, Dst) + .addReg(SrcReg) + .addReg(Idx) + .addImm(SubReg); } else { + setM0ToIndexFromSGPR(TII, MRI, MI, Offset); + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -3632,6 +3667,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return &MBB; } + // Control flow needs to be inserted if indexing with a VGPR. const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); @@ -3640,19 +3676,23 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); - auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, - Offset, UseGPRIdxMode, true); + Register SGPRIdxReg; + auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, + UseGPRIdxMode, SGPRIdxReg); + MachineBasicBlock *LoopBB = InsPt->getParent(); if (UseGPRIdxMode) { - BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) - .addReg(SrcReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); - BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + const MCInstrDesc &GPRIDXDesc = + TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); + + BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) + .addReg(SrcReg) + .addReg(SGPRIdxReg) + .addImm(SubReg); } else { BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(SrcReg, RegState::Undef, SubReg) + .addReg(SrcReg, 0, SubReg) .addReg(SrcReg, RegState::Implicit); } @@ -3675,6 +3715,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); + const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); // This can be an immediate, but will be folded later. assert(Val->getReg()); @@ -3700,23 +3741,36 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return &MBB; } - const MCInstrDesc &MovRelDesc - = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false); - - if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { + // Check for a SGPR index. 
+ if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); - BuildMI(MBB, I, DL, MovRelDesc, Dst) - .addReg(SrcVec->getReg()) - .add(*Val) - .addImm(SubReg); - if (UseGPRIdxMode) - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + if (UseGPRIdxMode) { + Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset); + + const MCInstrDesc &GPRIDXDesc = + TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); + BuildMI(MBB, I, DL, GPRIDXDesc, Dst) + .addReg(SrcVec->getReg()) + .add(*Val) + .addReg(Idx) + .addImm(SubReg); + } else { + setM0ToIndexFromSGPR(TII, MRI, MI, Offset); + + const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( + TRI.getRegSizeInBits(*VecRC), 32, false); + BuildMI(MBB, I, DL, MovRelDesc, Dst) + .addReg(SrcVec->getReg()) + .add(*Val) + .addImm(SubReg); + } MI.eraseFromParent(); return &MBB; } + // Control flow needs to be inserted if indexing with a VGPR. if (Val->isReg()) MRI.clearKillFlags(Val->getReg()); @@ -3724,16 +3778,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, Register PhiReg = MRI.createVirtualRegister(VecRC); - auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, - Offset, UseGPRIdxMode, false); + Register SGPRIdxReg; + auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, + UseGPRIdxMode, SGPRIdxReg); MachineBasicBlock *LoopBB = InsPt->getParent(); - BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) - .addReg(PhiReg) - .add(*Val) - .addImm(AMDGPU::sub0); - if (UseGPRIdxMode) - BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); + if (UseGPRIdxMode) { + const MCInstrDesc &GPRIDXDesc = + TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); + + BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst) + .addReg(PhiReg) + .add(*Val) + .addReg(SGPRIdxReg) + .addImm(AMDGPU::sub0); + } else { + const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( + TRI.getRegSizeInBits(*VecRC), 32, false); + BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) + .addReg(PhiReg) + .add(*Val) + .addImm(AMDGPU::sub0); + } MI.eraseFromParent(); return LoopBB; @@ -3849,7 +3915,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + unsigned LoOpc = IsAdd ? 
AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) .addReg(CarryReg, RegState::Define) .add(SrcReg0Sub0) @@ -3912,10 +3978,29 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Src2.setReg(RegOp2); } - if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) { - BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) - .addReg(Src2.getReg()) - .addImm(0); + const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); + if (TRI->getRegSizeInBits(*Src2RC) == 64) { + if (ST.hasScalarCompareEq64()) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) + .addReg(Src2.getReg()) + .addImm(0); + } else { + const TargetRegisterClass *SubRC = + TRI->getSubRegClass(Src2RC, AMDGPU::sub0); + MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( + MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC); + MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( + MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC); + Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) + .add(Src2Sub0) + .add(Src2Sub1); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) + .addReg(Src2_32, RegState::Kill) + .addImm(0); + } } else { BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32)) .addReg(Src2.getReg()) @@ -3936,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } - case AMDGPU::SI_INIT_EXEC: - // This should be before all vector instructions. - BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::EXEC) - .addImm(MI.getOperand(0).getImm()); - MI.eraseFromParent(); - return BB; - - case AMDGPU::SI_INIT_EXEC_LO: - // This should be before all vector instructions. - BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), - AMDGPU::EXEC_LO) - .addImm(MI.getOperand(0).getImm()); - MI.eraseFromParent(); - return BB; - - case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { - // Extract the thread count from an SGPR input and set EXEC accordingly. - // Since BFM can't shift by 64, handle that case with CMP + CMOV. - // - // S_BFE_U32 count, input, {shift, 7} - // S_BFM_B64 exec, count, 0 - // S_CMP_EQ_U32 count, 64 - // S_CMOV_B64 exec, -1 - MachineInstr *FirstMI = &*BB->begin(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - Register InputReg = MI.getOperand(0).getReg(); - Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - bool Found = false; - - // Move the COPY of the input reg to the beginning, so that we can use it. - for (auto I = BB->begin(); I != &MI; I++) { - if (I->getOpcode() != TargetOpcode::COPY || - I->getOperand(0).getReg() != InputReg) - continue; - - if (I == FirstMI) { - FirstMI = &*++BB->begin(); - } else { - I->removeFromParent(); - BB->insert(FirstMI, &*I); - } - Found = true; - break; - } - assert(Found); - (void)Found; - - // This should be before all vector instructions. - unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1; - bool isWave32 = getSubtarget()->isWave32(); - unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) - .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); - BuildMI(*BB, FirstMI, DebugLoc(), - TII->get(isWave32 ? 
AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), - Exec) - .addReg(CountReg) - .addImm(0); - BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) - .addReg(CountReg, RegState::Kill) - .addImm(getSubtarget()->getWavefrontSize()); - BuildMI(*BB, FirstMI, DebugLoc(), - TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), - Exec) - .addImm(-1); - MI.eraseFromParent(); - return BB; - } - case AMDGPU::GET_GROUPSTATICSIZE: { assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); @@ -4086,13 +4100,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::ADJCALLSTACKDOWN: { const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); MachineInstrBuilder MIB(*MF, &MI); - - // Add an implicit use of the frame offset reg to prevent the restore copy - // inserted after the call from being reorderd after stack operations in the - // the caller's frame. MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) - .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit) - .addReg(Info->getFrameOffsetReg(), RegState::Implicit); + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); return BB; } case AMDGPU::SI_CALL_ISEL: { @@ -4111,9 +4120,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } - case AMDGPU::V_ADD_I32_e32: - case AMDGPU::V_SUB_I32_e32: - case AMDGPU::V_SUBREV_I32_e32: { + case AMDGPU::V_ADD_CO_U32_e32: + case AMDGPU::V_SUB_CO_U32_e32: + case AMDGPU::V_SUBREV_CO_U32_e32: { // TODO: Define distinct V_*_I32_Pseudo instructions instead. const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc = MI.getOpcode(); @@ -4154,9 +4163,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return emitGWSMemViolTestLoop(MI, BB); case AMDGPU::S_SETREG_B32: { - if (!getSubtarget()->hasDenormModeInst()) - return BB; - // Try to optimize cases that only set the denormal mode or rounding mode. // // If the s_setreg_b32 fully sets all of the bits in the rounding mode or @@ -4166,9 +4172,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( // FIXME: This could be predicates on the immediate, but tablegen doesn't // allow you to have a no side effect instruction in the output of a // sideeffecting pattern. - - // TODO: Should also emit a no side effects pseudo if only FP bits are - // touched, even if not all of them or to a variable. unsigned ID, Offset, Width; AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); if (ID != AMDGPU::Hwreg::ID_MODE) @@ -4176,50 +4179,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); const unsigned SetMask = WidthMask << Offset; - unsigned SetDenormOp = 0; - unsigned SetRoundOp = 0; - - // The dedicated instructions can only set the whole denorm or round mode at - // once, not a subset of bits in either. - if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | - AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) { - // If this fully sets both the round and denorm mode, emit the two - // dedicated instructions for these. 
- assert(Offset == 0); - SetRoundOp = AMDGPU::S_ROUND_MODE; - SetDenormOp = AMDGPU::S_DENORM_MODE; - } else if (Width == 4) { - if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) { + + if (getSubtarget()->hasDenormModeInst()) { + unsigned SetDenormOp = 0; + unsigned SetRoundOp = 0; + + // The dedicated instructions can only set the whole denorm or round mode + // at once, not a subset of bits in either. + if (SetMask == + (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) { + // If this fully sets both the round and denorm mode, emit the two + // dedicated instructions for these. SetRoundOp = AMDGPU::S_ROUND_MODE; - assert(Offset == 0); - } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) { SetDenormOp = AMDGPU::S_DENORM_MODE; - assert(Offset == 4); + } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) { + SetRoundOp = AMDGPU::S_ROUND_MODE; + } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) { + SetDenormOp = AMDGPU::S_DENORM_MODE; } - } - if (SetRoundOp || SetDenormOp) { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); - if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { - unsigned ImmVal = Def->getOperand(1).getImm(); - if (SetRoundOp) { - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) - .addImm(ImmVal & 0xf); + if (SetRoundOp || SetDenormOp) { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); + if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { + unsigned ImmVal = Def->getOperand(1).getImm(); + if (SetRoundOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) + .addImm(ImmVal & 0xf); + + // If we also have the denorm mode, get just the denorm mode bits. + ImmVal >>= 4; + } - // If we also have the denorm mode, get just the denorm mode bits. - ImmVal >>= 4; - } + if (SetDenormOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) + .addImm(ImmVal & 0xf); + } - if (SetDenormOp) { - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) - .addImm(ImmVal & 0xf); + MI.eraseFromParent(); + return BB; } - - MI.eraseFromParent(); } } + // If only FP bits are touched, used the no side effects pseudo. + if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | + AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) + MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); + return BB; } default: @@ -4256,6 +4263,12 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const { return (VT == MVT::i16) ? MVT::i16 : MVT::i32; } +LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const { + return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) + ? Ty.changeElementSize(16) + : Ty.changeElementSize(32); +} + // Answering this is somewhat tricky and depends on the specific device which // have different rates for fma or all f64 operations. // @@ -4457,6 +4470,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMUL: case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: + case ISD::UADDSAT: + case ISD::USUBSAT: + case ISD::SADDSAT: + case ISD::SSUBSAT: return splitBinaryVectorOp(Op, DAG); case ISD::SMULO: case ISD::UMULO: @@ -4467,31 +4484,47 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// Used for D16: Casts the result of an instruction into the right vector, +// packs values if loads return unpacked values. 
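The reworked S_SETREG_B32 handling above keys entirely off which MODE-register bits the hwreg immediate covers before choosing between the dedicated S_ROUND_MODE/S_DENORM_MODE instructions and the side-effect-free S_SETREG_B32_mode pseudo. Below is a minimal standalone sketch of that mask test, assuming the usual hwreg operand packing (register ID in bits 0-5, offset in bits 6-10, width minus one in bits 11-15), ID_MODE = 1, and 4-bit FP round and denorm fields at offsets 0 and 4; the names and constants are illustrative, not the LLVM definitions.

#include <cstdint>

struct Hwreg { unsigned Id, Offset, Width; };

// Assumed simm16 layout: id[5:0], offset[10:6], width-1[15:11].
static Hwreg decodeHwreg(uint32_t Imm) {
  return {Imm & 0x3f, (Imm >> 6) & 0x1f, ((Imm >> 11) & 0x1f) + 1};
}

// Assumed MODE register layout: round mode in bits 3:0, denorm mode in 7:4.
enum : unsigned { FpRoundMask = 0xf, FpDenormMask = 0xf0, IdMode = 1 };

// True when the write covers exactly the round field, the denorm field, or
// both; this is the condition under which the lowering above can emit the
// dedicated whole-field mode instructions.
static bool coversWholeFpModeFields(uint32_t Imm) {
  Hwreg H = decodeHwreg(Imm);
  if (H.Id != IdMode)
    return false;
  unsigned FieldBits = H.Width >= 32 ? ~0u : ((1u << H.Width) - 1);
  unsigned SetMask = FieldBits << H.Offset;
  return SetMask == FpRoundMask || SetMask == FpDenormMask ||
         SetMask == (FpRoundMask | FpDenormMask);
}

// Weaker condition used for the no-side-effects pseudo: every written bit is
// an FP mode bit, even if only part of a field is touched.
static bool touchesOnlyFpBits(unsigned SetMask) {
  return (SetMask & (FpRoundMask | FpDenormMask)) == SetMask;
}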
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked) { if (!LoadVT.isVector()) return Result; + // Cast back to the original packed type or to a larger type that is a + // multiple of 32 bit for D16. Widening the return type is a required for + // legalization. + EVT FittingLoadVT = LoadVT; + if ((LoadVT.getVectorNumElements() % 2) == 1) { + FittingLoadVT = + EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), + LoadVT.getVectorNumElements() + 1); + } + if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16. // Truncate to v2i16/v4i16. - EVT IntLoadVT = LoadVT.changeTypeToInteger(); + EVT IntLoadVT = FittingLoadVT.changeTypeToInteger(); // Workaround legalizer not scalarizing truncate after vector op - // legalization byt not creating intermediate vector trunc. + // legalization but not creating intermediate vector trunc. SmallVector<SDValue, 4> Elts; DAG.ExtractVectorElements(Result, Elts); for (SDValue &Elt : Elts) Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt); + // Pad illegal v1i16/v3fi6 to v4i16 + if ((LoadVT.getVectorNumElements() % 2) == 1) + Elts.push_back(DAG.getUNDEF(MVT::i16)); + Result = DAG.getBuildVector(IntLoadVT, DL, Elts); // Bitcast to original type (v2f16/v4f16). - return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); } // Cast back to the original packed type. - return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result); + return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result); } SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, @@ -4505,10 +4538,16 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, EVT LoadVT = M->getValueType(0); EVT EquivLoadVT = LoadVT; - if (Unpacked && LoadVT.isVector()) { - EquivLoadVT = LoadVT.isVector() ? - EVT::getVectorVT(*DAG.getContext(), MVT::i32, - LoadVT.getVectorNumElements()) : LoadVT; + if (LoadVT.isVector()) { + if (Unpacked) { + EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, + LoadVT.getVectorNumElements()); + } else if ((LoadVT.getVectorNumElements() % 2) == 1) { + // Widen v3f16 to legal type + EquivLoadVT = + EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(), + LoadVT.getVectorNumElements() + 1); + } } // Change from v4f16/v2f16 to EquivLoadVT. @@ -4519,8 +4558,6 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand()); - if (!Unpacked) // Just adjusted the opcode. 
- return Load; SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked); @@ -4724,8 +4761,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { if (Res.getOpcode() == ISD::MERGE_VALUES) { // FIXME: Hacky - Results.push_back(Res.getOperand(0)); - Results.push_back(Res.getOperand(1)); + for (unsigned I = 0; I < Res.getNumOperands(); I++) { + Results.push_back(Res.getOperand(I)); + } } else { Results.push_back(Res); Results.push_back(Res.getValue(1)); @@ -4967,7 +5005,7 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); // Get the return address reg and mark it as an implicit live-in - unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent())); + Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent())); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } @@ -5063,7 +5101,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); assert(UserSGPR != AMDGPU::NoRegister); SDValue QueuePtr = CreateLiveInRegister( DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); @@ -5136,14 +5174,15 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // private_segment_aperture_base_hi. uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; - SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset); + SDValue Ptr = + DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not // be available and how do we get it? MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo, - MinAlign(64, StructOffset), + commonAlignment(Align(64), StructOffset), MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } @@ -5504,7 +5543,9 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // variable, but since the encoding of $symbol starts 4 bytes after the start // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to - // compute the correct address. + // compute the correct address. Similarly for the s_addc_u32 instruction, the + // encoding of $symbol starts 12 bytes after the start of the s_add_u32 + // instruction. 
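Per the comment above, the two halves of the PC-relative offset end up in an s_add_u32/s_addc_u32 pair, and the $symbol literal of each instruction sits 4 and 12 bytes past the point where the PC was captured, which is why 4 and 12 are folded into the lo and hi addends. As a rough, self-contained illustration of the carry-propagating 32-bit addition that pair performs (not the actual PC_ADD_REL_OFFSET lowering):

#include <cstdint>

// Sketch: add a 64-bit displacement, supplied as separately relocated lo and
// hi dwords, to a 64-bit PC the way an s_add_u32/s_addc_u32 pair would.
static uint64_t addPcRelOffset(uint64_t Pc, uint32_t Lo, uint32_t Hi) {
  uint32_t PcLo = uint32_t(Pc), PcHi = uint32_t(Pc >> 32);
  uint32_t SumLo = PcLo + Lo;
  uint32_t Carry = SumLo < PcLo ? 1 : 0; // SCC written by s_add_u32
  uint32_t SumHi = PcHi + Hi + Carry;    // s_addc_u32 consumes SCC
  return (uint64_t(SumHi) << 32) | SumLo;
}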
SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); SDValue PtrHi; @@ -5512,7 +5553,7 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); } else { PtrHi = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1); + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1); } return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -5521,15 +5562,32 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); + SDLoc DL(GSD); + EVT PtrVT = Op.getValueType(); + const GlobalValue *GV = GSD->getGlobal(); if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && shouldUseLDSConstAddress(GV)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || - GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) + GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + GV->hasExternalLinkage()) { + Type *Ty = GV->getValueType(); + // HIP uses an unsized array `extern __shared__ T s[]` or similar + // zero-sized type in other languages to declare the dynamic shared + // memory which size is not known at the compile time. They will be + // allocated by the runtime and placed directly after the static + // allocated ones. They all share the same offset. + if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) { + assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); + // Adjust alignment for that dynamic shared memory array. + MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV)); + return SDValue( + DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); + } + } return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); - - SDLoc DL(GSD); - EVT PtrVT = Op.getValueType(); + } if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), @@ -5713,7 +5771,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, SDValue Data(Result, 0); SDValue TexFail; - if (IsTexFail) { + if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) { SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); if (MaskPopVT.isVector()) { Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, @@ -5722,10 +5780,6 @@ static SDValue constructRetValue(SelectionDAG &DAG, Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, SDValue(Result, 0), ZeroIdx); } - - TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, - SDValue(Result, 0), - DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } if (DataDwordVT.isVector()) @@ -5735,13 +5789,27 @@ static SDValue constructRetValue(SelectionDAG &DAG, if (IsD16) Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); - if (!ReqRetVT.isVector()) + EVT LegalReqRetVT = ReqRetVT; + if (!ReqRetVT.isVector()) { Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); + } else { + // We need to widen the return vector to a legal type + if ((ReqRetVT.getVectorNumElements() % 2) == 1 && + ReqRetVT.getVectorElementType().getSizeInBits() == 16) { + LegalReqRetVT = + EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(), + ReqRetVT.getVectorNumElements() + 1); + } + } + Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data); - Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); + if (IsTexFail) { + TexFail = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, 
DL, MVT::i32, SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); - if (TexFail) return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + } if (Result->getNumValues() == 1) return Data; @@ -5798,7 +5866,7 @@ static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, - SelectionDAG &DAG) const { + SelectionDAG &DAG, bool WithChain) const { SDLoc DL(Op); MachineFunction &MF = DAG.getMachineFunction(); const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>(); @@ -5810,10 +5878,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; - bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; + bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); - SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); - SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); + SmallVector<EVT, 3> ResultTypes(Op->values()); + SmallVector<EVT, 3> OrigResultTypes(Op->values()); bool IsD16 = false; bool IsG16 = false; bool IsA16 = false; @@ -5821,7 +5889,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, int NumVDataDwords; bool AdjustRetType = false; - unsigned AddrIdx; // Index of first address argument + // Offset of intrinsic arguments + const unsigned ArgOffset = WithChain ? 2 : 1; + unsigned DMask; unsigned DMaskLanes = 0; @@ -5839,15 +5909,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op, ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32; DMask = Is64Bit ? 0xf : 0x3; NumVDataDwords = Is64Bit ? 4 : 2; - AddrIdx = 4; } else { DMask = Is64Bit ? 0x3 : 0x1; NumVDataDwords = Is64Bit ? 2 : 1; - AddrIdx = 3; } } else { - unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1; - auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); + auto *DMaskConst = + cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex)); DMask = DMaskConst->getZExtValue(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); @@ -5860,7 +5928,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return Op; // D16 is unsupported for this instruction IsD16 = true; - VData = handleD16VData(VData, DAG); + VData = handleD16VData(VData, DAG, true); } NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32; @@ -5880,63 +5948,56 @@ SDValue SITargetLowering::lowerImage(SDValue Op, (!LoadVT.isVector() && DMaskLanes > 1)) return Op; - if (IsD16 && !Subtarget->hasUnpackedD16VMem()) + // The sq block of gfx8 and gfx9 do not estimate register use correctly + // for d16 image_gather4, image_gather4_l, and image_gather4_lz + // instructions. + if (IsD16 && !Subtarget->hasUnpackedD16VMem() && + !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) NumVDataDwords = (DMaskLanes + 1) / 2; else NumVDataDwords = DMaskLanes; AdjustRetType = true; } - - AddrIdx = DMaskIdx + 1; } - unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0; - unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0; - unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 
1 : 0; - unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients + - NumCoords + NumLCM; - unsigned NumMIVAddrs = NumVAddrs; - + unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; SmallVector<SDValue, 4> VAddrs; // Optimize _L to _LZ when _L is zero if (LZMappingInfo) { - if (auto ConstantLod = - dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) { + if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>( + Op.getOperand(ArgOffset + Intr->LodIndex))) { if (ConstantLod->isZero() || ConstantLod->isNegative()) { IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l - NumMIVAddrs--; // remove 'lod' + VAddrEnd--; // remove 'lod' } } } // Optimize _mip away, when 'lod' is zero if (MIPMappingInfo) { - if (auto ConstantLod = - dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) { + if (auto *ConstantLod = dyn_cast<ConstantSDNode>( + Op.getOperand(ArgOffset + Intr->MipIndex))) { if (ConstantLod->isNullValue()) { IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip - NumMIVAddrs--; // remove 'lod' + VAddrEnd--; // remove 'mip' } } } // Push back extra arguments. - for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++) - VAddrs.push_back(Op.getOperand(AddrIdx + I)); + for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) + VAddrs.push_back(Op.getOperand(ArgOffset + I)); // Check for 16 bit addresses or derivatives and pack if true. - unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; - unsigned CoordIdx = DimIdx + NumGradients; - unsigned CoordsEnd = AddrIdx + NumMIVAddrs; - - MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); + MVT VAddrVT = + Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); MVT VAddrScalarVT = VAddrVT.getScalarType(); MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; - VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); VAddrScalarVT = VAddrVT.getScalarType(); IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; if (IsA16 || IsG16) { @@ -5971,17 +6032,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } // Don't compress addresses for G16 - const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx; - packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx, - PackEndIdx, NumGradients); + const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); + packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, + ArgOffset + Intr->GradientStart, PackEndIdx, + Intr->NumGradients); if (!IsA16) { // Add uncompressed address - for (unsigned I = CoordIdx; I < CoordsEnd; I++) + for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } } else { - for (unsigned I = DimIdx; I < CoordsEnd; I++) + for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } @@ -6004,22 +6066,19 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); - unsigned CtrlIdx; // Index of texfailctrl argument SDValue Unorm; if (!BaseOpcode->Sampler) { Unorm = True; - CtrlIdx = AddrIdx + NumVAddrs + 1; } else { auto UnormConst = - cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); + cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex)); Unorm = UnormConst->getZExtValue() ? 
True : False; - CtrlIdx = AddrIdx + NumVAddrs + 3; } SDValue TFE; SDValue LWE; - SDValue TexFail = Op.getOperand(CtrlIdx); + SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex); bool IsTexFail = false; if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail)) return Op; @@ -6066,42 +6125,40 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue DLC; if (BaseOpcode->Atomic) { GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC, - IsGFX10 ? &DLC : nullptr)) + if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), + DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr)) return Op; } else { - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC, - IsGFX10 ? &DLC : nullptr)) + if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), + DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr)) return Op; } SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata - if (UseNSA) { - for (const SDValue &Addr : VAddrs) - Ops.push_back(Addr); - } else { + if (UseNSA) + append_range(Ops, VAddrs); + else Ops.push_back(VAddr); - } - Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc + Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex)); if (BaseOpcode->Sampler) - Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler + Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex)); Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); - if (IsGFX10) + if (IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); - if (IsGFX10) + if (IsGFX10Plus) Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); - if (IsGFX10) + if (IsGFX10Plus) Ops.push_back(IsA16 ? True : False); Ops.push_back(TFE); Ops.push_back(LWE); - if (!IsGFX10) + if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? True : False); @@ -6112,7 +6169,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (IsGFX10) { + if (IsGFX10Plus) { Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, UseNSA ? AMDGPU::MIMGEncGfx10NSA : AMDGPU::MIMGEncGfx10Default, @@ -6391,11 +6448,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; + bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); SDValue GLC; SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, - IsGFX10 ? &DLC : nullptr)) + IsGFX10Plus ? 
&DLC : nullptr)) return Op; return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); @@ -6417,11 +6474,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); - DiagnosticInfoUnsupported BadIntrin( - MF.getFunction(), "intrinsic not supported on subtarget", - DL.getDebugLoc()); - DAG.getContext()->diagnose(BadIntrin); - return DAG.getUNDEF(VT); + return emitRemovedIntrinsicError(DAG, DL, VT); } case Intrinsic::amdgcn_ldexp: return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, @@ -6567,7 +6620,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) - return lowerImage(Op, ImageDimIntr, DAG); + return lowerImage(Op, ImageDimIntr, DAG, false); return Op; } @@ -6597,26 +6650,59 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset, cast<ConstantSDNode>(Offset)->getSExtValue(); } -static unsigned getDSShaderTypeValue(const MachineFunction &MF) { - switch (MF.getFunction().getCallingConv()) { - case CallingConv::AMDGPU_PS: - return 1; - case CallingConv::AMDGPU_VS: - return 2; - case CallingConv::AMDGPU_GS: - return 3; - case CallingConv::AMDGPU_HS: - case CallingConv::AMDGPU_LS: - case CallingConv::AMDGPU_ES: - report_fatal_error("ds_ordered_count unsupported for this calling conv"); - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - case CallingConv::C: - case CallingConv::Fast: - default: - // Assume other calling conventions are various compute callable functions - return 0; - } +SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, + SelectionDAG &DAG, + unsigned NewOpcode) const { + SDLoc DL(Op); + + SDValue VData = Op.getOperand(2); + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + VData, // vdata + Op.getOperand(3), // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + Offsets.first, // voffset + Op.getOperand(5), // soffset + Offsets.second, // offset + Op.getOperand(6), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen + }; + + auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + + EVT MemVT = VData.getValueType(); + return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, + M->getMemOperand()); +} + +SDValue +SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, + unsigned NewOpcode) const { + SDLoc DL(Op); + + SDValue VData = Op.getOperand(2); + auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); + SDValue Ops[] = { + Op.getOperand(0), // Chain + VData, // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + Offsets.first, // voffset + Op.getOperand(6), // soffset + Offsets.second, // offset + Op.getOperand(7), // cachepolicy + DAG.getTargetConstant(1, DL, MVT::i1), // idxen + }; + + auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); + + EVT MemVT = VData.getValueType(); + return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, + M->getMemOperand()); } SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, @@ -6656,7 +6742,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, report_fatal_error("ds_ordered_count: wave_done requires wave_release"); unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; - unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction()); + unsigned ShaderType = + SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction()); unsigned Offset0 = OrderedCountIndex << 2; unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4); @@ -6893,7 +6980,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_umax: case Intrinsic::amdgcn_buffer_atomic_and: case Intrinsic::amdgcn_buffer_atomic_or: - case Intrinsic::amdgcn_buffer_atomic_xor: { + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); unsigned IdxEn = 1; if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) @@ -6953,6 +7041,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_buffer_atomic_fadd: + if (!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6960,155 +7059,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_raw_buffer_atomic_swap: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); case Intrinsic::amdgcn_raw_buffer_atomic_sub: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); case Intrinsic::amdgcn_raw_buffer_atomic_smin: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN); case Intrinsic::amdgcn_raw_buffer_atomic_umin: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN); case Intrinsic::amdgcn_raw_buffer_atomic_smax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX); case Intrinsic::amdgcn_raw_buffer_atomic_umax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX); case Intrinsic::amdgcn_raw_buffer_atomic_and: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); case Intrinsic::amdgcn_raw_buffer_atomic_or: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); case Intrinsic::amdgcn_raw_buffer_atomic_xor: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); case Intrinsic::amdgcn_raw_buffer_atomic_inc: - case Intrinsic::amdgcn_raw_buffer_atomic_dec: { - auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - Offsets.first, // voffset - Op.getOperand(5), // soffset - Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getTargetConstant(0, DL, MVT::i1), // idxen - }; - EVT VT = 
Op.getValueType(); - - auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); - unsigned Opcode = 0; - - switch (IntrID) { - case Intrinsic::amdgcn_raw_buffer_atomic_swap: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_add: - Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_sub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_smin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_umin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_smax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_umax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_and: - Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_or: - Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_xor: - Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_inc: - Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; - break; - case Intrinsic::amdgcn_raw_buffer_atomic_dec: - Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; - break; - default: - llvm_unreachable("unhandled atomic opcode"); - } - - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); case Intrinsic::amdgcn_struct_buffer_atomic_swap: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_struct_buffer_atomic_add: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD); case Intrinsic::amdgcn_struct_buffer_atomic_sub: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB); case Intrinsic::amdgcn_struct_buffer_atomic_smin: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_SMIN); case Intrinsic::amdgcn_struct_buffer_atomic_umin: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_UMIN); case Intrinsic::amdgcn_struct_buffer_atomic_smax: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_SMAX); case Intrinsic::amdgcn_struct_buffer_atomic_umax: + return lowerStructBufferAtomicIntrin(Op, DAG, + AMDGPUISD::BUFFER_ATOMIC_UMAX); case Intrinsic::amdgcn_struct_buffer_atomic_and: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND); case Intrinsic::amdgcn_struct_buffer_atomic_or: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR); case Intrinsic::amdgcn_struct_buffer_atomic_xor: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR); case Intrinsic::amdgcn_struct_buffer_atomic_inc: - case Intrinsic::amdgcn_struct_buffer_atomic_dec: { - auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); - SDValue Ops[] = { - Op.getOperand(0), // Chain - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - Offsets.first, // voffset - Op.getOperand(6), // soffset - Offsets.second, // offset - Op.getOperand(7), // cachepolicy - DAG.getTargetConstant(1, DL, MVT::i1), // idxen - }; - EVT VT = Op.getValueType(); - - auto *M = 
cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); - unsigned Opcode = 0; - - switch (IntrID) { - case Intrinsic::amdgcn_struct_buffer_atomic_swap: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_add: - Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_sub: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_smin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_umin: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_smax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_umax: - Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_and: - Opcode = AMDGPUISD::BUFFER_ATOMIC_AND; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_or: - Opcode = AMDGPUISD::BUFFER_ATOMIC_OR; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_xor: - Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_inc: - Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; - break; - case Intrinsic::amdgcn_struct_buffer_atomic_dec: - Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; - break; - default: - llvm_unreachable("unhandled atomic opcode"); - } + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC); + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC); - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); unsigned IdxEn = 1; @@ -7180,7 +7188,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_csub: { + case Intrinsic::amdgcn_global_atomic_fadd: { + if (!Op.getValue(0).use_empty()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } MemSDNode *M = cast<MemSDNode>(Op); SDValue Ops[] = { M->getOperand(0), // Chain @@ -7188,15 +7204,85 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, M->getOperand(3) // Value }; - return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op), - M->getVTList(), Ops, M->getMemoryVT(), - M->getMemOperand()); + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); } + case Intrinsic::amdgcn_image_bvh_intersect_ray: { + SDLoc DL(Op); + MemSDNode *M = cast<MemSDNode>(Op); + SDValue NodePtr = M->getOperand(2); + SDValue RayExtent = M->getOperand(3); + SDValue RayOrigin = M->getOperand(4); + SDValue RayDir = M->getOperand(5); + SDValue RayInvDir = M->getOperand(6); + SDValue TDescr = M->getOperand(7); + + assert(NodePtr.getValueType() == MVT::i32 || + NodePtr.getValueType() == MVT::i64); + assert(RayDir.getValueType() == MVT::v4f16 || + RayDir.getValueType() == MVT::v4f32); + + bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; + bool Is64 = 
NodePtr.getValueType() == MVT::i64; + unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa + : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + + SmallVector<SDValue, 16> Ops; + + auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) { + SmallVector<SDValue, 3> Lanes; + DAG.ExtractVectorElements(Op, Lanes, 0, 3); + if (Lanes[0].getValueSizeInBits() == 32) { + for (unsigned I = 0; I < 3; ++I) + Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I])); + } else { + if (IsAligned) { + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Lanes[0], Lanes[1] }))); + Ops.push_back(Lanes[2]); + } else { + SDValue Elt0 = Ops.pop_back_val(); + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Elt0, Lanes[0] }))); + Ops.push_back( + DAG.getBitcast(MVT::i32, + DAG.getBuildVector(MVT::v2f16, DL, + { Lanes[1], Lanes[2] }))); + } + } + }; + if (Is64) + DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2); + else + Ops.push_back(NodePtr); + + Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent)); + packLanes(RayOrigin, true); + packLanes(RayDir, true); + packLanes(RayInvDir, false); + Ops.push_back(TDescr); + if (IsA16) + Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1)); + Ops.push_back(M->getChain()); + + auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); + MachineMemOperand *MemRef = M->getMemOperand(); + DAG.setNodeMemRefs(NewNode, {MemRef}); + return SDValue(NewNode, 0); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) - return lowerImage(Op, ImageDimIntr, DAG); + return lowerImage(Op, ImageDimIntr, DAG, true); return SDValue(); } @@ -7234,8 +7320,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, return NewOp; } -SDValue SITargetLowering::handleD16VData(SDValue VData, - SelectionDAG &DAG) const { +SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG, + bool ImageStore) const { EVT StoreVT = VData.getValueType(); // No change for f16 and legal vector D16 types. @@ -7243,19 +7329,70 @@ SDValue SITargetLowering::handleD16VData(SDValue VData, return VData; SDLoc DL(VData); - assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16"); + unsigned NumElements = StoreVT.getVectorNumElements(); if (Subtarget->hasUnpackedD16VMem()) { // We need to unpack the packed data to store. EVT IntStoreVT = StoreVT.changeTypeToInteger(); SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); - EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, - StoreVT.getVectorNumElements()); + EVT EquivStoreVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements); SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); return DAG.UnrollVectorOp(ZExt.getNode()); } + // The sq block of gfx8.1 does not estimate register use correctly for d16 + // image store instructions. The data operand is computed as if it were not a + // d16 image instruction. 
+ if (ImageStore && Subtarget->hasImageStoreD16Bug()) { + // Bitcast to i16 + EVT IntStoreVT = StoreVT.changeTypeToInteger(); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + // Decompose into scalars + SmallVector<SDValue, 4> Elts; + DAG.ExtractVectorElements(IntVData, Elts); + + // Group pairs of i16 into v2i16 and bitcast to i32 + SmallVector<SDValue, 4> PackedElts; + for (unsigned I = 0; I < Elts.size() / 2; I += 1) { + SDValue Pair = + DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]}); + SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); + PackedElts.push_back(IntPair); + } + if ((NumElements % 2) == 1) { + // Handle v3i16 + unsigned I = Elts.size() / 2; + SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL, + {Elts[I * 2], DAG.getUNDEF(MVT::i16)}); + SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); + PackedElts.push_back(IntPair); + } + + // Pad using UNDEF + PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32)); + + // Build final vector + EVT VecVT = + EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size()); + return DAG.getBuildVector(VecVT, DL, PackedElts); + } + + if (NumElements == 3) { + EVT IntStoreVT = + EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT WidenedStoreVT = EVT::getVectorVT( + *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); + EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), + WidenedStoreVT.getStoreSizeInBits()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); + return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); + } + assert(isTypeLegal(StoreVT)); return VData; } @@ -7433,8 +7570,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, EVT VDataVT = VData.getValueType(); EVT EltType = VDataVT.getScalarType(); bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); - if (IsD16) + if (IsD16) { VData = handleD16VData(VData, DAG); + VDataVT = VData.getValueType(); + } if (!isTypeLegal(VDataVT)) { VData = @@ -7478,8 +7617,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, EVT EltType = VDataVT.getScalarType(); bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); - if (IsD16) + if (IsD16) { VData = handleD16VData(VData, DAG); + VDataVT = VData.getValueType(); + } if (!isTypeLegal(VDataVT)) { VData = @@ -7514,57 +7655,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } - - case Intrinsic::amdgcn_buffer_atomic_fadd: { - unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; - SDValue Ops[] = { - Chain, - Op.getOperand(2), // vdata - Op.getOperand(3), // rsrc - Op.getOperand(4), // vindex - SDValue(), // voffset -- will be set by setBufferOffsets - SDValue(), // soffset -- will be set by setBufferOffsets - SDValue(), // offset -- will be set by setBufferOffsets - DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen - }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
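The hasImageStoreD16Bug workaround added above repacks d16 store data so that register usage matches what the affected hardware expects of a non-d16 store: 16-bit elements are paired into dwords, an odd tail is padded with undef, and the result is then widened to one dword per original element. A self-contained sketch of that packing, using uint16_t/uint32_t values and zero in place of UNDEF (illustrative only; it mirrors the packing order, with element 0 in the low 16 bits of each dword, not the SelectionDAG node building itself):

#include <cstdint>
#include <vector>

static std::vector<uint32_t> packD16StoreData(const std::vector<uint16_t> &Elts) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I + 1 < Elts.size(); I += 2)
    Packed.push_back(uint32_t(Elts[I]) | (uint32_t(Elts[I + 1]) << 16));
  if (Elts.size() % 2)                        // e.g. v3f16: finish the last pair
    Packed.push_back(uint32_t(Elts.back()));  // high half left as 0 (UNDEF)
  Packed.resize(Elts.size(), 0);              // pad to one dword per element
  return Packed;
}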
- if (IdxEn) - Offset = 0; - EVT VT = Op.getOperand(2).getValueType(); - - auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); - unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD - : AMDGPUISD::BUFFER_ATOMIC_FADD; - - return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } - - case Intrinsic::amdgcn_global_atomic_fadd: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), // ptr - Op.getOperand(3) // vdata - }; - EVT VT = Op.getOperand(3).getValueType(); - - auto *M = cast<MemSDNode>(Op); - if (VT.isVector()) { - return DAG.getMemIntrinsicNode( - AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT, - M->getMemOperand()); - } - - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()).getValue(1); - } case Intrinsic::amdgcn_end_cf: return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, Op->getOperand(2), Chain), 0); @@ -7572,7 +7662,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) - return lowerImage(Op, ImageDimIntr, DAG); + return lowerImage(Op, ImageDimIntr, DAG, true); return Op; } @@ -7848,13 +7938,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, DL); - } - unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && @@ -7879,9 +7962,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); - if (NumElements == 3) - return WidenVectorLoad(Op, DAG); - return SplitVectorLoad(Op, DAG); + return WidenOrSplitVectorLoad(Op, DAG); } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private @@ -7897,9 +7978,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { Alignment >= 4 && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); - if (NumElements == 3) - return WidenVectorLoad(Op, DAG); - return SplitVectorLoad(Op, DAG); + return WidenOrSplitVectorLoad(Op, DAG); } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private @@ -7914,7 +7993,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SplitVectorLoad(Op, DAG); // v3 loads not supported on SI. if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) - return WidenVectorLoad(Op, DAG); + return WidenOrSplitVectorLoad(Op, DAG); + // v3 and v4 loads are supported for private and global memory. return SDValue(); } @@ -7938,15 +8018,19 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SplitVectorLoad(Op, DAG); // v3 loads not supported on SI. 
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) - return WidenVectorLoad(Op, DAG); + return WidenOrSplitVectorLoad(Op, DAG); + return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_read_b128 if possible. - if (Subtarget->useDS128() && Load->getAlignment() >= 16 && - MemVT.getStoreSize() == 16) + // Use ds_read_b128 or ds_read_b96 when possible. + if (Subtarget->hasDS96AndDS128() && + ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) || + MemVT.getStoreSize() == 12) && + allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS, + Load->getAlign())) return SDValue(); if (NumElements > 2) @@ -7963,6 +8047,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SplitVectorLoad(Op, DAG); } } + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + return SDValue(); } @@ -8003,8 +8095,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath || - Flags.hasApproximateFuncs(); + bool AllowInaccurateRcp = Flags.hasApproximateFuncs(); // Without !fpmath accuracy information, we can't do more because we don't // know exactly whether rcp is accurate enough to meet !fpmath requirement. @@ -8045,6 +8136,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } +SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + SDValue Y = Op.getOperand(1); + EVT VT = Op.getValueType(); + const SDNodeFlags Flags = Op->getFlags(); + + bool AllowInaccurateDiv = Flags.hasApproximateFuncs() || + DAG.getTarget().Options.UnsafeFPMath; + if (!AllowInaccurateDiv) + return SDValue(); + + SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y); + SDValue One = DAG.getConstantFP(1.0, SL, VT); + + SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y); + SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); + + R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R); + SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One); + R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R); + SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R); + SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X); + return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret); +} + static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags) { @@ -8273,8 +8391,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getTarget().Options.UnsafeFPMath) - return lowerFastUnsafeFDIV(Op, DAG); + if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG)) + return FastLowered; SDLoc SL(Op); SDValue X = Op.getOperand(0); @@ -8368,11 +8486,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - return expandUnalignedStore(Store, DAG); - } 
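The lowerFastUnsafeFDIV64 path added above builds the quotient from an approximate reciprocal, refined by two Newton-Raphson steps and a final residual correction on the quotient, all expressed as FMAs. Below is a standalone sketch of the same arithmetic, using a narrowed single-precision divide as a stand-in for the hardware reciprocal approximation (an assumption made purely for illustration; edge cases such as operands outside float range are ignored, since the real lowering only takes this path under approximate-math flags):

#include <cmath>

static double fastUnsafeFDiv64(double X, double Y) {
  double R = double(1.0f / float(Y));          // crude reciprocal seed
  R = std::fma(std::fma(-Y, R, 1.0), R, R);    // r += r * (1 - y*r)
  R = std::fma(std::fma(-Y, R, 1.0), R, R);    // second refinement step
  double Q = X * R;                            // initial quotient
  return std::fma(std::fma(-Y, Q, X), R, Q);   // q += r * (x - y*q)
}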
- unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && @@ -8397,6 +8510,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v3 stores not supported on SI. if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) return SplitVectorStore(Op, DAG); + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) + return expandUnalignedStore(Store, DAG); + return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -8407,16 +8525,20 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4 || NumElements == 3) + if (NumElements > 4 || + (NumElements == 3 && !Subtarget->enableFlatScratch())) return SplitVectorStore(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - // Use ds_write_b128 if possible. - if (Subtarget->useDS128() && Store->getAlignment() >= 16 && - VT.getStoreSize() == 16 && NumElements != 3) + // Use ds_write_b128 or ds_write_b96 when possible. + if (Subtarget->hasDS96AndDS128() && + ((Subtarget->useDS128() && VT.getStoreSize() == 16) || + (VT.getStoreSize() == 12)) && + allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS, + Store->getAlign())) return SDValue(); if (NumElements > 2) @@ -8433,6 +8555,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); } + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { + if (VT.isVector()) + return SplitVectorStore(Op, DAG); + return expandUnalignedStore(Store, DAG); + } + return SDValue(); } else { llvm_unreachable("unhandled address space"); @@ -8474,7 +8603,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co unsigned AS = AtomicNode->getAddressSpace(); // No custom lowering required for local address space - if (!isFlatGlobalAddrSpace(AS)) + if (!AMDGPU::isFlatGlobalAddrSpace(AS)) return Op; // Non-local address space requires custom lowering for atomic compare @@ -8584,7 +8713,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, EVT VT = N->getValueType(0); SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1); - SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32); + SDValue COffset = DAG.getConstant(Offset, SL, VT); SDNodeFlags Flags; Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && @@ -8594,12 +8723,28 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags); } +/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset +/// by the chain and intrinsic ID. Theoretically we would also need to check the +/// specific intrinsic, but they all place the pointer operand first. 
+static unsigned getBasePtrIndex(const MemSDNode *N) { + switch (N->getOpcode()) { + case ISD::STORE: + case ISD::INTRINSIC_W_CHAIN: + case ISD::INTRINSIC_VOID: + return 2; + default: + return 1; + } +} + SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const { - SDValue Ptr = N->getBasePtr(); SelectionDAG &DAG = DCI.DAG; SDLoc SL(N); + unsigned PtrIdx = getBasePtrIndex(N); + SDValue Ptr = N->getOperand(PtrIdx); + // TODO: We could also do this for multiplies. if (Ptr.getOpcode() == ISD::SHL) { SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), @@ -8607,7 +8752,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N, if (NewPtr) { SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); - NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr; + NewOps[PtrIdx] = NewPtr; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } } @@ -8868,7 +9013,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && - N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { uint32_t LHSMask = getPermuteMask(DAG, LHS); uint32_t RHSMask = getPermuteMask(DAG, RHS); if (LHSMask != ~0u && RHSMask != ~0u) { @@ -8965,7 +9110,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() && - N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) { + N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { uint32_t LHSMask = getPermuteMask(DAG, LHS); uint32_t RHSMask = getPermuteMask(DAG, RHS); if (LHSMask != ~0u && RHSMask != ~0u) { @@ -10509,8 +10654,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return SDValue(); switch (N->getOpcode()) { - default: - return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::ADD: return performAddCombine(N, DCI); case ISD::SUB: @@ -10537,35 +10680,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performMinMaxCombine(N, DCI); case ISD::FMA: return performFMACombine(N, DCI); - case ISD::LOAD: { - if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI)) - return Widended; - LLVM_FALLTHROUGH; - } - case ISD::STORE: - case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: - case ISD::ATOMIC_CMP_SWAP: - case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: - case ISD::ATOMIC_SWAP: - case ISD::ATOMIC_LOAD_ADD: - case ISD::ATOMIC_LOAD_SUB: - case ISD::ATOMIC_LOAD_AND: - case ISD::ATOMIC_LOAD_OR: - case ISD::ATOMIC_LOAD_XOR: - case ISD::ATOMIC_LOAD_NAND: - case ISD::ATOMIC_LOAD_MIN: - case ISD::ATOMIC_LOAD_MAX: - case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: - case ISD::ATOMIC_LOAD_FADD: - case AMDGPUISD::ATOMIC_INC: - case AMDGPUISD::ATOMIC_DEC: - case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. 
-    if (DCI.isBeforeLegalize())
-      break;
-    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
   case ISD::AND:
     return performAndCombine(N, DCI);
   case ISD::OR:
@@ -10630,14 +10744,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performExtractVectorEltCombine(N, DCI);
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
+  case ISD::LOAD: {
+    if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
+      return Widended;
+    LLVM_FALLTHROUGH;
+  }
+  default: {
+    if (!DCI.isBeforeLegalize()) {
+      if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
+        return performMemSDNodeCombine(MemNode, DCI);
+    }
+
+    break;
+  }
   }
+
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
 /// Helper function for adjustWritemask
 static unsigned SubIdx2Lane(unsigned Idx) {
   switch (Idx) {
-  default: return 0;
+  default: return ~0u;
   case AMDGPU::sub0: return 0;
   case AMDGPU::sub1: return 1;
   case AMDGPU::sub2: return 2;
@@ -10697,6 +10825,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
     // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+    if (Lane == ~0u)
+      return Node;
 
     // Check if the use is for the TFE/LWE generated result at VGPRn+1.
     if (UsesTFC && Lane == TFCLane) {
@@ -10826,8 +10956,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
 
   // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
   // to try understanding copies to physical registers.
-  if (SrcVal.getValueType() == MVT::i1 &&
-      Register::isPhysicalRegister(DestReg->getReg())) {
+  if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
     SDLoc SL(Node);
     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
     SDValue VReg = DAG.getRegister(
@@ -10870,7 +10999,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
   unsigned Opcode = Node->getMachineOpcode();
 
   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
-      !TII->isGather4(Opcode)) {
+      !TII->isGather4(Opcode) &&
+      AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) {
     return adjustWritemask(Node, DAG);
   }
 
@@ -10881,14 +11011,14 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
   }
 
   switch (Opcode) {
-  case AMDGPU::V_DIV_SCALE_F32:
-  case AMDGPU::V_DIV_SCALE_F64: {
+  case AMDGPU::V_DIV_SCALE_F32_e64:
+  case AMDGPU::V_DIV_SCALE_F64_e64: {
     // Satisfy the operand register constraint when one of the inputs is
     // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
-    SDValue Src0 = Node->getOperand(0);
-    SDValue Src1 = Node->getOperand(1);
-    SDValue Src2 = Node->getOperand(2);
+    SDValue Src0 = Node->getOperand(1);
+    SDValue Src1 = Node->getOperand(3);
+    SDValue Src2 = Node->getOperand(5);
 
     if ((Src0.isMachineOpcode() &&
          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
@@ -10923,10 +11053,10 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
     } else
       break;
 
-    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
-    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
-      Ops.push_back(Node->getOperand(I));
-
+    SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
+    Ops[1] = Src0;
+    Ops[3] = Src1;
+    Ops[5] = Src2;
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
@@ -10962,8 +11092,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
       MachineOperand &Op = MI.getOperand(I);
       if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
           OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
-          !Register::isVirtualRegister(Op.getReg()) ||
-          !TRI->isAGPR(MRI, Op.getReg()))
+          !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg()))
         continue;
       auto *Src = MRI.getUniqueVRegDef(Op.getReg());
       if (!Src || !Src->isCopy() ||
@@ -10985,8 +11114,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
   if (NoRetAtomicOp != -1) {
     if (!Node->hasAnyUseOfValue(0)) {
-      MI.setDesc(TII->get(NoRetAtomicOp));
+      int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                               AMDGPU::OpName::glc1);
+      if (Glc1Idx != -1)
+        MI.RemoveOperand(Glc1Idx);
       MI.RemoveOperand(0);
+      MI.setDesc(TII->get(NoRetAtomicOp));
       return;
     }
 
@@ -11341,17 +11474,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   Info->limitOccupancy(MF);
 
   if (ST.isWave32() && !MF.empty()) {
-    // Add VCC_HI def because many instructions marked as imp-use VCC where
-    // we may only define VCC_LO. If nothing defines VCC_HI we may end up
-    // having a use of undef.
-    const SIInstrInfo *TII = ST.getInstrInfo();
-    DebugLoc DL;
-
-    MachineBasicBlock &MBB = MF.front();
-    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
-    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
-
     for (auto &MBB : MF) {
       for (auto &MI : MBB) {
         TII->fixImplicitOperands(MI);
@@ -11379,6 +11502,55 @@ void SITargetLowering::computeKnownBitsForFrameIndex(
   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
 }
 
+static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
+                                   KnownBits &Known, unsigned Dim) {
+  unsigned MaxValue =
+      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
+  Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+}
+
+void SITargetLowering::computeKnownBitsForTargetInstr(
+    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
+    const MachineRegisterInfo &MRI, unsigned Depth) const {
+  const MachineInstr *MI = MRI.getVRegDef(R);
+  switch (MI->getOpcode()) {
+  case AMDGPU::G_INTRINSIC: {
+    switch (MI->getIntrinsicID()) {
+    case Intrinsic::amdgcn_workitem_id_x:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
+      break;
+    case Intrinsic::amdgcn_workitem_id_y:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
+      break;
+    case Intrinsic::amdgcn_workitem_id_z:
+      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
+      break;
+    case Intrinsic::amdgcn_mbcnt_lo:
+    case Intrinsic::amdgcn_mbcnt_hi: {
+      // These return at most the wavefront size - 1.
+      unsigned Size = MRI.getType(R).getSizeInBits();
+      Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+      break;
+    }
+    case Intrinsic::amdgcn_groupstaticsize: {
+      // We can report everything over the maximum size as 0. We can't report
+      // based on the actual size because we don't know if it's accurate or not
+      // at any given point.
+      Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
+      break;
+    }
+    }
+    break;
+  }
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+    Known.Zero.setHighBits(24);
+    break;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+    Known.Zero.setHighBits(16);
+    break;
+  }
+}
+
 Align SITargetLowering::computeKnownAlignForTargetInstr(
     GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
     unsigned Depth) const {
@@ -11484,46 +11656,40 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
   return false;
 }
 
-bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
-  FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
-{
+bool SITargetLowering::isSDNodeSourceOfDivergence(
+    const SDNode *N, FunctionLoweringInfo *FLI,
+    LegacyDivergenceAnalysis *KDA) const {
   switch (N->getOpcode()) {
-    case ISD::CopyFromReg:
-    {
-      const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
-      const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
-      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-      Register Reg = R->getReg();
+  case ISD::CopyFromReg: {
+    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+    Register Reg = R->getReg();
 
-      // FIXME: Why does this need to consider isLiveIn?
+    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+      return !TRI->isSGPRReg(MRI, Reg);
 
-      if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
-        return KDA->isDivergent(V);
+    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
+      return KDA->isDivergent(V);
 
-      assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
-      return !TRI->isSGPRReg(MRI, Reg);
-    }
-    break;
-    case ISD::LOAD: {
-      const LoadSDNode *L = cast<LoadSDNode>(N);
-      unsigned AS = L->getAddressSpace();
-      // A flat load may access private memory.
-      return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
-    } break;
-    case ISD::CALLSEQ_END:
+    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+    return !TRI->isSGPRReg(MRI, Reg);
+  }
+  case ISD::LOAD: {
+    const LoadSDNode *L = cast<LoadSDNode>(N);
+    unsigned AS = L->getAddressSpace();
+    // A flat load may access private memory.
+    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+  }
+  case ISD::CALLSEQ_END:
     return true;
-    break;
-    case ISD::INTRINSIC_WO_CHAIN:
-    {
-
-    }
-    return AMDGPU::isIntrinsicSourceOfDivergence(
-      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
-    case ISD::INTRINSIC_W_CHAIN:
-    return AMDGPU::isIntrinsicSourceOfDivergence(
-      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+  case ISD::INTRINSIC_WO_CHAIN:
+    return AMDGPU::isIntrinsicSourceOfDivergence(
+        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+  case ISD::INTRINSIC_W_CHAIN:
+    return AMDGPU::isIntrinsicSourceOfDivergence(
+        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
   }
   return false;
 }
@@ -11558,6 +11724,16 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                     SNaN, Depth);
 }
 
+// Global FP atomic instructions have a hardcoded FP mode and do not support
+// FP32 denormals, and only support v2f16 denormals.
+static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+  const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+  auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
+  if (&Flt == &APFloat::IEEEsingle())
+    return DenormMode == DenormalMode::getPreserveSign();
+  return DenormMode == DenormalMode::getIEEE();
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   switch (RMW->getOperation()) {
@@ -11576,10 +11752,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     unsigned AS = RMW->getPointerAddressSpace();
 
     if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+      if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+        return AtomicExpansionKind::CmpXChg;
+
       return RMW->use_empty() ? AtomicExpansionKind::None :
                                 AtomicExpansionKind::CmpXChg;
     }
 
+    // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
+    // to round-to-nearest-even.
    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
  }
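A note on the getBasePtrIndex() helper added in the performMemSDNodeCombine hunks above: plain loads keep their pointer at operand 1 (right after the chain), while stores and chained target intrinsics carry one extra operand before the pointer (the stored value, or the intrinsic ID), which puts it at operand 2. A standalone sketch of that layout using a simplified stand-in node type (the enum, struct and values below are hypothetical, not LLVM's MemSDNode):

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for a DAG memory node: an opcode plus an operand list,
// where operand 0 is always the chain.
enum class Opc { Load, Store, IntrinsicWithChain, IntrinsicVoid };
struct Node {
  Opc Opcode;
  std::vector<uint64_t> Operands;
};

// Same shape as getBasePtrIndex() above: stores and chained intrinsics have
// one extra operand (stored value / intrinsic ID) before the pointer.
static unsigned basePtrIndex(const Node &N) {
  switch (N.Opcode) {
  case Opc::Store:
  case Opc::IntrinsicWithChain:
  case Opc::IntrinsicVoid:
    return 2;
  default:
    return 1;
  }
}

int main() {
  Node Load{Opc::Load, {/*chain*/ 0, /*ptr*/ 0x1000}};
  Node Store{Opc::Store, {/*chain*/ 0, /*value*/ 42, /*ptr*/ 0x2000}};
  assert(Load.Operands[basePtrIndex(Load)] == 0x1000);
  assert(Store.Operands[basePtrIndex(Store)] == 0x2000);
  return 0;
}

The combine then writes the rewritten pointer back at the same index (NewOps[PtrIdx] in the hunk above), which is why a single helper covers both the lookup and the update.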
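The knownBitsForWorkitemID() helper added above converts an upper bound on the workitem ID into known-zero high bits by counting the leading zeros of the maximum possible value. A self-contained sketch of just that arithmetic, with a hypothetical bound of 1024 workitems per dimension (the bound is illustrative, not taken from the patch):

#include <cassert>
#include <cstdint>

// Portable leading-zero count for a 32-bit value (returns 32 for 0).
static unsigned countLeadingZeros32(uint32_t V) {
  unsigned N = 0;
  for (uint32_t Mask = 0x80000000u; Mask != 0 && (V & Mask) == 0; Mask >>= 1)
    ++N;
  return N;
}

int main() {
  // Hypothetical bound: IDs in this dimension never exceed 1023, i.e. a
  // workgroup size of at most 1024.
  const uint32_t MaxWorkitemID = 1023;
  // Any value <= 1023 fits in 10 bits, so the top 22 bits of a 32-bit
  // register holding the ID are guaranteed to be zero.
  assert(countLeadingZeros32(MaxWorkitemID) == 22);
  return 0;
}

The mbcnt and groupstaticsize cases in the same hunk follow the same pattern: a hard architectural bound (wavefront size, local memory size) is turned into a count of guaranteed-zero high bits.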
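The fpModeMatchesGlobalFPAtomicMode() and shouldExpandAtomicRMWInIR() changes above encode a small decision table: the global FP-add atomic is only usable when the function's denormal mode matches the mode hardwired into the instruction (preserve-sign for f32, IEEE for v2f16), and even then only when the atomic's result is unused; every other combination is expanded to a compare-exchange loop. A minimal sketch of that decision with hypothetical stand-in enums (the real code works on AtomicRMWInst and DenormalMode):

enum class Denorm { IEEE, PreserveSign };
enum class Expand { None, CmpXchg }; // None = emit the native global atomic

static Expand chooseFAddLowering(bool IsF32, Denorm FunctionMode,
                                 bool ResultUsed) {
  // The instruction's FP mode is fixed: the f32 form flushes denormals
  // (preserve-sign), the packed f16 form keeps them (IEEE).
  Denorm InstrMode = IsF32 ? Denorm::PreserveSign : Denorm::IEEE;
  if (FunctionMode != InstrMode)
    return Expand::CmpXchg;           // mode mismatch: fall back to a CAS loop
  return ResultUsed ? Expand::CmpXchg // result needed: the CAS loop returns it
                    : Expand::None;   // fire-and-forget: use the atomic add
}

int main() {
  // f32 atomic fadd, function compiled with denormal flushing, result unused:
  // the native instruction can be used.
  return chooseFAddLowering(true, Denorm::PreserveSign, false) == Expand::None
             ? 0
             : 1;
}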