path: root/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2021-06-13 19:31:46 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-07-31 18:56:55 +0000
commit     af732203b8f7f006927528db5497f5cbc4c4742a (patch)
tree       596f112de3b76118552871dbb6114bb7e3e17f40 /contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent     83dea422ac8d4a8323e64203c2eadaa813768717 (diff)
Merge llvm-project 12.0.1 release and follow-up fixes

Merge llvm-project main llvmorg-12-init-17869-g8e464dd76bef

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12-init-17869-g8e464dd76bef, the last commit before
the upstream release/12.x branch was created.

PR: 255570
(cherry picked from commit e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)

Merge llvm-project 12.0.0 release

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.0-0-gd28af7c654d8, a.k.a. 12.0.0 release.

PR: 255570
(cherry picked from commit d409305fa3838fb39b38c26fc085fb729b8766d5)

Disable strict-fp for powerpcspe, as it does not work properly yet

Merge commit 5c18d1136665 from llvm git (by Qiu Chaofan):

  [SPE] Disable strict-fp for SPE by default

  As discussed in PR50385, strict-fp on PowerPC SPE has not been handled
  well. This patch disables it by default for SPE.

  Reviewed By: nemanjai, vit9696, jhibbits
  Differential Revision: https://reviews.llvm.org/D103235

PR: 255570
(cherry picked from commit 715df83abc049b23d9acddc81f2480bd4c056d64)

Apply upstream libc++ fix to allow building with devel/xxx-xtoolchain-gcc

Merge commit 52e9d80d5db2 from llvm git (by Jason Liu):

  [libc++] add `inline` for __open's definition in ifstream and ofstream

  Summary: When building with gcc on AIX, it seems that gcc does not
  like the `always_inline` without the `inline` keyword. So adding the
  inline keywords in for __open in ifstream and ofstream. That will also
  make it consistent with __open in basic_filebuf (it seems we added
  `inline` there before for gcc build as well).

  Differential Revision: https://reviews.llvm.org/D99422

PR: 255570
(cherry picked from commit d099db25464b826c5724cf2fb5b22292bbe15f6e)

Undefine HAVE_(DE)REGISTER_FRAME in llvm's config.h on arm

Otherwise, the lli tool (enabled by WITH_CLANG_EXTRAS) won't link on
arm, stating that __register_frame is undefined. This function is
normally provided by libunwind, but explicitly not for the ARM
Exception ABI.

Reported by: oh
PR: 255570
(cherry picked from commit f336b45e943c7f9a90ffcea1a6c4c7039e54c73c)

Merge llvm-project 12.0.1 rc2

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-rc2-0-ge7dac564cd0e, a.k.a. 12.0.1 rc2.

PR: 255570
(cherry picked from commit 23408297fbf3089f0388a8873b02fa75ab3f5bb9)

Revert libunwind change to fix backtrace segfault on aarch64

Revert commit 22b615a96593 from llvm git (by Daniel Kiss):

  [libunwind] Support for leaf function unwinding.

  Unwinding a leaf function is useful in cases when the backtrace finds
  a leaf function, for example when it caused a signal. This patch also
  adds support for DW_CFA_undefined, because it marks the end of the
  frames.

  Ryan Prichard provided code for the tests.

  Reviewed By: #libunwind, mstorsjo
  Differential Revision: https://reviews.llvm.org/D83573

  Reland, with the test limited to the x86_64-linux target.

Bisection has shown that this particular upstream commit causes
programs using backtrace(3) on aarch64 to segfault. This affects the
lang/rust port, for instance. Until this can be fixed upstream, revert
the commit for now.

Reported by: mikael
PR: 256864
(cherry picked from commit 5866c369e4fd917c0d456f0f10b92ee354b82279)

Merge llvm-project 12.0.1 release

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-0-gfed41342a82f, a.k.a. 12.0.1 release.

PR: 255570
(cherry picked from commit 4652422eb477731f284b1345afeefef7f269da50)

compiler-rt: build out-of-line LSE atomics helpers for aarch64

Both clang >= 12 and gcc >= 10.1 now default to -moutline-atomics for
aarch64. This requires a bunch of helper functions in libcompiler_rt.a,
to avoid link errors like "undefined symbol: __aarch64_ldadd8_acq_rel".

(Note: of course you can use -mno-outline-atomics as a workaround too,
but this would negate the potential performance benefit of the faster
LSE instructions.)

Bump __FreeBSD_version so ports maintainers can easily detect this.

PR: 257392
(cherry picked from commit cc55ee8009a550810d38777fd6ace9abf3a2f6b4)
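
Aside (an illustrative sketch, not part of the commit): the out-of-line
LSE helpers mentioned above are the routines the compiler calls when it
cannot assume LSE support at compile time. Roughly, an 8-byte fetch_add
with acquire-release ordering built for aarch64 under the default
-moutline-atomics becomes a call to __aarch64_ldadd8_acq_rel, which the
linker must resolve from libcompiler_rt.a. The file and function names
below are made up for illustration.

  // lse_demo.cpp -- hypothetical example, not part of this commit.
  // With clang >= 12 or gcc >= 10.1 on aarch64 (default -moutline-atomics),
  // the fetch_add below typically compiles to a call to
  // __aarch64_ldadd8_acq_rel; without the helpers in libcompiler_rt.a the
  // link fails with "undefined symbol: __aarch64_ldadd8_acq_rel".
  // Building with -mno-outline-atomics avoids the helper call, at the cost
  // of not using the faster LSE instructions on CPUs that have them.
  #include <atomic>
  #include <cstdint>

  uint64_t add_acq_rel(std::atomic<uint64_t> &counter, uint64_t delta) {
    // 8-byte atomic add (the "8" in the helper name), acq_rel ordering;
    // the outlined helper selects LSE or LL/SC at run time.
    return counter.fetch_add(delta, std::memory_order_acq_rel);
  }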
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp  2013
1 file changed, 1097 insertions, 916 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d035aa8f72bd..839437b5e3f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13,72 +13,21 @@
#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/DAGCombine.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/TargetCallingConv.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CodeGen.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetOptions.h"
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <iterator>
-#include <tuple>
-#include <utility>
-#include <vector>
using namespace llvm;
@@ -449,6 +398,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FPOW, MVT::f16, Promote);
+ setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Custom);
setOperationAction(ISD::FEXP, MVT::f16, Custom);
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
@@ -486,6 +436,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasBFE())
setHasExtractBitsInsn(true);
+ // Clamp modifier on add/sub
+ if (Subtarget->hasIntClamp()) {
+ setOperationAction(ISD::UADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::i32, Legal);
+ }
+
+ if (Subtarget->hasAddNoCarry()) {
+ setOperationAction(ISD::SADDSAT, MVT::i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::i32, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::i32, Legal);
+ }
+
setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
@@ -531,13 +494,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
- setOperationAction(ISD::ROTR, MVT::i16, Promote);
- setOperationAction(ISD::ROTL, MVT::i16, Promote);
+ setOperationAction(ISD::ROTR, MVT::i16, Expand);
+ setOperationAction(ISD::ROTL, MVT::i16, Expand);
setOperationAction(ISD::SDIV, MVT::i16, Promote);
setOperationAction(ISD::UDIV, MVT::i16, Promote);
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
+ setOperationAction(ISD::UADDSAT, MVT::i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::i16, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
@@ -702,6 +667,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+ setOperationAction(ISD::UADDSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::USUBSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::SADDSAT, MVT::v2i16, Legal);
+ setOperationAction(ISD::SSUBSAT, MVT::v2i16, Legal);
+
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
@@ -729,6 +699,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);
+
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
setOperationAction(ISD::FMA, MVT::v4f16, Custom);
@@ -779,6 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v3i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
@@ -790,6 +767,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v3f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
@@ -844,6 +823,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// FIXME: In other contexts we pretend this is a per-function property.
setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
@@ -888,15 +869,18 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
if (VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32)
- return ScalarVT.getSimpleVT();
+ if (Size == 16) {
+ if (Subtarget->has16BitInsts())
+ return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ return VT.isInteger() ? MVT::i32 : MVT::f32;
+ }
- if (Size > 32)
- return MVT::i32;
+ if (Size < 16)
+ return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
+ return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
+ }
- if (Size == 16 && Subtarget->has16BitInsts())
- return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- } else if (VT.getSizeInBits() > 32)
+ if (VT.getSizeInBits() > 32)
return MVT::i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -913,14 +897,15 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
- if (Size == 32)
+ // FIXME: Should probably promote 8-bit vectors to i16.
+ if (Size == 16 && Subtarget->has16BitInsts())
+ return (NumElts + 1) / 2;
+
+ if (Size <= 32)
return NumElts;
if (Size > 32)
return NumElts * ((Size + 31) / 32);
-
- if (Size == 16 && Subtarget->has16BitInsts())
- return (NumElts + 1) / 2;
} else if (VT.getSizeInBits() > 32)
return (VT.getSizeInBits() + 31) / 32;
@@ -935,6 +920,16 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
+ // FIXME: We should fix the ABI to be the same on targets without 16-bit
+ // support, but unless we can properly handle 3-vectors, it will be still be
+ // inconsistent.
+ if (Size == 16 && Subtarget->has16BitInsts()) {
+ RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ IntermediateVT = RegisterVT;
+ NumIntermediates = (NumElts + 1) / 2;
+ return NumIntermediates;
+ }
+
if (Size == 32) {
RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;
@@ -942,20 +937,26 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
- if (Size > 32) {
+ if (Size < 16 && Subtarget->has16BitInsts()) {
+ // FIXME: Should probably form v2i16 pieces
+ RegisterVT = MVT::i16;
+ IntermediateVT = ScalarVT;
+ NumIntermediates = NumElts;
+ return NumIntermediates;
+ }
+
+
+ if (Size != 16 && Size <= 32) {
RegisterVT = MVT::i32;
- IntermediateVT = RegisterVT;
- NumIntermediates = NumElts * ((Size + 31) / 32);
+ IntermediateVT = ScalarVT;
+ NumIntermediates = NumElts;
return NumIntermediates;
}
- // FIXME: We should fix the ABI to be the same on targets without 16-bit
- // support, but unless we can properly handle 3-vectors, it will be still be
- // inconsistent.
- if (Size == 16 && Subtarget->has16BitInsts()) {
- RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+ if (Size > 32) {
+ RegisterVT = MVT::i32;
IntermediateVT = RegisterVT;
- NumIntermediates = (NumElts + 1) / 2;
+ NumIntermediates = NumElts * ((Size + 31) / 32);
return NumIntermediates;
}
}
@@ -1007,14 +1008,12 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (RsrcIntr->IsImage) {
- Info.ptrVal = MFI->getImagePSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.ptrVal =
+ MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
Info.align.reset();
} else {
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.ptrVal =
+ MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
}
Info.flags = MachineMemOperand::MODereferenceable;
@@ -1056,8 +1055,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
+ Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
+ ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
@@ -1091,11 +1091,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- Info.opc = ISD::INTRINSIC_VOID;
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.ptrVal = MFI->getBufferPSV(
- *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
- CI.getArgOperand(1));
+ Info.ptrVal =
+ MFI->getBufferPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
@@ -1105,16 +1104,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
- ->getPointerElementType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- return true;
- }
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1136,10 +1125,31 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align.reset();
Info.flags = MachineMemOperand::MOLoad |
MachineMemOperand::MOStore |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
+ Info.ptrVal =
+ MFI->getImagePSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MODereferenceable;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1175,9 +1185,13 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
- case Intrinsic::amdgcn_ds_fmax: {
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -1234,7 +1248,7 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// assume those use MUBUF instructions. Scratch loads / stores are currently
// implemented as mubuf instructions with offen bit set, so slightly
// different than the normal addr64.
- if (!isUInt<12>(AM.BaseOffs))
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
return false;
// FIXME: Since we can split immediate into soffset and immediate offset,
@@ -1355,37 +1369,77 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
}
bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
- unsigned Size, unsigned AddrSpace, unsigned Align,
+ unsigned Size, unsigned AddrSpace, Align Alignment,
MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
- // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
- // aligned, 8 byte access in a single operation using ds_read2/write2_b32
- // with adjacent offsets.
- bool AlignedBy4 = (Align % 4 == 0);
+ // Check if alignment requirements for ds_read/write instructions are
+ // disabled.
+ if (Subtarget->hasUnalignedDSAccessEnabled() &&
+ !Subtarget->hasLDSMisalignedBug()) {
+ if (IsFast)
+ *IsFast = Alignment != Align(2);
+ return true;
+ }
+
+ if (Size == 64) {
+ // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
+ // aligned, 8 byte access in a single operation using ds_read2/write2_b32
+ // with adjacent offsets.
+ bool AlignedBy4 = Alignment >= Align(4);
+ if (IsFast)
+ *IsFast = AlignedBy4;
+
+ return AlignedBy4;
+ }
+ if (Size == 96) {
+ // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
+ bool Aligned = Alignment >= Align(16);
+ if (IsFast)
+ *IsFast = Aligned;
+
+ return Aligned;
+ }
+ if (Size == 128) {
+ // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
+ // can do a 8 byte aligned, 16 byte access in a single operation using
+ // ds_read2/write2_b64.
+ bool Aligned = Alignment >= Align(8);
+ if (IsFast)
+ *IsFast = Aligned;
+
+ return Aligned;
+ }
+ }
+
+ if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+ bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
- return AlignedBy4;
+ return AlignedBy4 ||
+ Subtarget->enableFlatScratch() ||
+ Subtarget->hasUnalignedScratchAccess();
}
// FIXME: We have to be conservative here and assume that flat operations
// will access scratch. If we had access to the IR function, then we
// could determine if any private memory was used in the function.
- if (!Subtarget->hasUnalignedScratchAccess() &&
- (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
- AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
- bool AlignedBy4 = Align >= 4;
+ if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasUnalignedScratchAccess()) {
+ bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccess()) {
+ if (Subtarget->hasUnalignedBufferAccessEnabled() &&
+ !(AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
@@ -1393,7 +1447,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// 2-byte alignment is worse than 1 unless doing a 2-byte accesss.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- Align >= 4 : Align != 2;
+ Alignment >= Align(4) : Alignment != Align(2);
}
return true;
@@ -1409,12 +1463,12 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (IsFast)
*IsFast = true;
- return Size >= 32 && Align >= 4;
+ return Size >= 32 && Alignment >= Align(4);
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *IsFast) const {
+ EVT VT, unsigned AddrSpace, unsigned Alignment,
+ MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1428,7 +1482,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Align, Flags, IsFast);
+ Align(Alignment), Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1449,11 +1503,6 @@ EVT SITargetLowering::getOptimalMemOpType(
return MVT::Other;
}
-bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
- unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
-}
-
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
@@ -1461,6 +1510,11 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
return I && I->getMetadata("amdgpu.noclobber");
}
+bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
+ AS == AMDGPUAS::PRIVATE_ADDRESS;
+}
+
bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
// Flat -> private/local is a simple truncate.
@@ -1468,7 +1522,9 @@ bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
return true;
- return isNoopAddrSpaceCast(SrcAS, DestAS);
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+ return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
}
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
@@ -1537,7 +1593,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
+ return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
@@ -1597,9 +1653,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
// TODO: If we passed in the base kernel offset we could have a better
// alignment than 4, but we don't really need it.
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
- SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MOInvariant);
SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
@@ -1682,12 +1738,11 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
-static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
- CallingConv::ID CallConv,
- ArrayRef<ISD::InputArg> Ins,
- BitVector &Skipped,
- FunctionType *FType,
- SIMachineFunctionInfo *Info) {
+static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+ CallingConv::ID CallConv,
+ ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
+ FunctionType *FType,
+ SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
const ISD::InputArg *Arg = &Ins[I];
@@ -1895,26 +1950,26 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
if (Info.hasImplicitBufferPtr()) {
- unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
+ Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info.hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info.hasQueuePtr()) {
- unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
@@ -1929,13 +1984,13 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasDispatchID()) {
- unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
+ Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
@@ -1951,25 +2006,25 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
CallingConv::ID CallConv,
bool IsShader) const {
if (Info.hasWorkGroupIDX()) {
- unsigned Reg = Info.addWorkGroupIDX();
+ Register Reg = Info.addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
- unsigned Reg = Info.addWorkGroupIDY();
+ Register Reg = Info.addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
- unsigned Reg = Info.addWorkGroupIDZ();
+ Register Reg = Info.addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupInfo()) {
- unsigned Reg = Info.addWorkGroupInfo();
+ Register Reg = Info.addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -2020,26 +2075,28 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
- // If we have stack objects, we unquestionably need the private buffer
- // resource. For the Code Object V2 ABI, this will be the first 4 user
- // SGPR inputs. We can reserve those and use them directly.
+ if (!ST.enableFlatScratch()) {
+ if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+ // If we have stack objects, we unquestionably need the private buffer
+ // resource. For the Code Object V2 ABI, this will be the first 4 user
+ // SGPR inputs. We can reserve those and use them directly.
- Register PrivateSegmentBufferReg =
- Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- Info.setScratchRSrcReg(PrivateSegmentBufferReg);
- } else {
- unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
- // We tentatively reserve the last registers (skipping the last registers
- // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
- // we'll replace these with the ones immediately after those which were
- // really allocated. In the prologue copies will be inserted from the
- // argument to these reserved registers.
+ Register PrivateSegmentBufferReg =
+ Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ Info.setScratchRSrcReg(PrivateSegmentBufferReg);
+ } else {
+ unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+ // We tentatively reserve the last registers (skipping the last registers
+ // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+ // we'll replace these with the ones immediately after those which were
+ // really allocated. In the prologue copies will be inserted from the
+ // argument to these reserved registers.
- // Without HSA, relocations are used for the scratch pointer and the
- // buffer resource setup is always inserted in the prologue. Scratch wave
- // offset is still in an input SGPR.
- Info.setScratchRSrcReg(ReservedBufferReg);
+ // Without HSA, relocations are used for the scratch pointer and the
+ // buffer resource setup is always inserted in the prologue. Scratch wave
+ // offset is still in an input SGPR.
+ Info.setScratchRSrcReg(ReservedBufferReg);
+ }
}
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -2139,7 +2196,7 @@ SDValue SITargetLowering::LowerFormalArguments(
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -2152,12 +2209,21 @@ SDValue SITargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
- bool IsShader = AMDGPU::isShader(CallConv);
+ bool IsGraphics = AMDGPU::isGraphics(CallConv);
bool IsKernel = AMDGPU::isKernel(CallConv);
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
- if (IsShader) {
- processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+ if (IsGraphics) {
+ assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
+ (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ }
+
+ if (CallConv == CallingConv::AMDGPU_PS) {
+ processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
// At least one interpolation mode must be enabled or else the GPU will
// hang.
@@ -2172,39 +2238,28 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (CallConv == CallingConv::AMDGPU_PS) {
- if ((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11))) {
- CCInfo.AllocateReg(AMDGPU::VGPR0);
- CCInfo.AllocateReg(AMDGPU::VGPR1);
- Info->markPSInputAllocated(0);
- Info->markPSInputEnabled(0);
- }
- if (Subtarget->isAmdPalOS()) {
- // For isAmdPalOS, the user does not enable some bits after compilation
- // based on run-time states; the register values being generated here are
- // the final ones set in hardware. Therefore we need to apply the
- // workaround to PSInputAddr and PSInputEnable together. (The case where
- // a bit is set in PSInputAddr but not PSInputEnable is where the
- // frontend set up an input arg for a particular interpolation mode, but
- // nothing uses that input arg. Really we should have an earlier pass
- // that removes such an arg.)
- unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
- if ((PsInputBits & 0x7F) == 0 ||
- ((PsInputBits & 0xF) == 0 &&
- (PsInputBits >> 11 & 1)))
- Info->markPSInputEnabled(
- countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
- }
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
+ }
+ if (Subtarget->isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the
+ // frontend set up an input arg for a particular interpolation mode, but
+ // nothing uses that input arg. Really we should have an earlier pass
+ // that removes such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
}
-
- assert(!Info->hasDispatchPtr() &&
- !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
- !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
- !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
@@ -2253,9 +2308,23 @@ SDValue SITargetLowering::LowerFormalArguments(
const uint64_t Offset = VA.getLocMemOffset();
Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
- SDValue Arg =
- lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
- Ins[i].Flags.isSExt(), &Ins[i]);
+ if (Arg.Flags.isByRef()) {
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTargetMachine());
+ if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
+ Arg.Flags.getPointerAddrSpace())) {
+ Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
+ Arg.Flags.getPointerAddrSpace());
+ }
+
+ InVals.push_back(Ptr);
+ continue;
+ }
+
+ SDValue Arg = lowerKernargMemParameter(
+ DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@@ -2337,7 +2406,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// Start adding system SGPRs.
if (IsEntryFunc) {
- allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+ allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
@@ -2820,7 +2889,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
report_fatal_error("unsupported libcall legalization");
if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- !CLI.CB->getCalledFunction()) {
+ !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
return lowerUnhandledCall(CLI, InVals,
"unsupported indirect call to function ");
}
@@ -2830,11 +2899,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
- if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
- // Note the issue is with the CC of the calling function, not of the call
+ if (AMDGPU::isShader(CallConv)) {
+ // Note the issue is with the CC of the called function, not of the call
// itself.
return lowerUnhandledCall(CLI, InVals,
- "unsupported call from graphics shader of function ");
+ "unsupported call to a shader function ");
+ }
+
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
+ // Only allow calls with specific calling conventions.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported calling convention for call from "
+ "graphics shader of function ");
}
if (IsTailCall) {
@@ -2865,7 +2942,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -2894,14 +2972,16 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!IsSibCall) {
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
- SmallVector<SDValue, 4> CopyFromChains;
+ if (!Subtarget->enableFlatScratch()) {
+ SmallVector<SDValue, 4> CopyFromChains;
- // In the HSA case, this should be an identity copy.
- SDValue ScratchRSrcReg
- = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
- RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
- Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ // In the HSA case, this should be an identity copy.
+ SDValue ScratchRSrcReg
+ = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
+ RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+ Chain = DAG.getTokenFactor(DL, CopyFromChains);
+ }
}
MVT PtrVT = MVT::i32;
@@ -2992,14 +3072,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MemOpChains.push_back(Cpy);
} else {
- SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
- Alignment ? Alignment->value() : 0);
+ SDValue Store =
+ DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
MemOpChains.push_back(Store);
}
}
}
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ CallConv != CallingConv::AMDGPU_Gfx) {
// Copy special input registers after user input arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -3223,29 +3304,11 @@ Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
// If kill is not the last instruction, split the block so kill is always a
// proper terminator.
-MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+MachineBasicBlock *
+SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
-
- MachineBasicBlock::iterator SplitPoint(&MI);
- ++SplitPoint;
-
- if (SplitPoint == BB->end()) {
- // Don't bother with a new block.
- MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
- return BB;
- }
-
- MachineFunction *MF = BB->getParent();
- MachineBasicBlock *SplitBB
- = MF->CreateMachineBasicBlock(BB->getBasicBlock());
-
- MF->insert(++MachineFunction::iterator(BB), SplitBB);
- SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
-
- SplitBB->transferSuccessorsAndUpdatePHIs(BB);
- BB->addSuccessor(SplitBB);
-
MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
return SplitBB;
}
@@ -3357,20 +3420,14 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
// will only do one iteration. In the worst case, this will loop 64 times.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
-static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
- const SIInstrInfo *TII,
- MachineRegisterInfo &MRI,
- MachineBasicBlock &OrigBB,
- MachineBasicBlock &LoopBB,
- const DebugLoc &DL,
- const MachineOperand &IdxReg,
- unsigned InitReg,
- unsigned ResultReg,
- unsigned PhiReg,
- unsigned InitSaveExecReg,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static MachineBasicBlock::iterator
+emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
+ MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
+ const DebugLoc &DL, const MachineOperand &Idx,
+ unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
+ unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
+ Register &SGPRIdxReg) {
+
MachineFunction *MF = OrigBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -3396,12 +3453,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
// Read the next variant <- also loop target.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
- .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
+ .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
// Compare the just read M0 value to all possible Idx values.
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
- .addReg(CurrentIdxReg)
- .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ .addReg(CurrentIdxReg)
+ .addReg(Idx.getReg(), 0, Idx.getSubReg());
// Update EXEC, save the original EXEC value to VCC.
BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
@@ -3412,22 +3469,14 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
MRI.setSimpleHint(NewExec, CondReg);
if (UseGPRIdxMode) {
- unsigned IdxReg;
if (Offset == 0) {
- IdxReg = CurrentIdxReg;
+ SGPRIdxReg = CurrentIdxReg;
} else {
- IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
- .addReg(CurrentIdxReg, RegState::Kill)
- .addImm(Offset);
+ SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
+ .addReg(CurrentIdxReg, RegState::Kill)
+ .addImm(Offset);
}
- unsigned IdxMode = IsIndirectSrc ?
- AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
- MachineInstr *SetOn =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(IdxReg, RegState::Kill)
- .addImm(IdxMode);
- SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -3463,14 +3512,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
-static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
- MachineBasicBlock &MBB,
- MachineInstr &MI,
- unsigned InitResultReg,
- unsigned PhiReg,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static MachineBasicBlock::iterator
+loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
+ unsigned InitResultReg, unsigned PhiReg, int Offset,
+ bool UseGPRIdxMode, Register &SGPRIdxReg) {
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -3499,7 +3544,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode, IsIndirectSrc);
+ Offset, UseGPRIdxMode, SGPRIdxReg);
+
MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(LoopBB);
++MBBI;
@@ -3530,64 +3576,45 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
}
-// Return true if the index is an SGPR and was set.
-static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
- MachineRegisterInfo &MRI,
- MachineInstr &MI,
- int Offset,
- bool UseGPRIdxMode,
- bool IsIndirectSrc) {
+static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, MachineInstr &MI,
+ int Offset) {
MachineBasicBlock *MBB = MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
assert(Idx->getReg() != AMDGPU::NoRegister);
- if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
- return false;
-
- if (UseGPRIdxMode) {
- unsigned IdxMode = IsIndirectSrc ?
- AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
- if (Offset == 0) {
- MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .add(*Idx)
- .addImm(IdxMode);
+ if (Offset == 0) {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
+ } else {
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .add(*Idx)
+ .addImm(Offset);
+ }
+}
- SetOn->getOperand(3).setIsUndef();
- } else {
- Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
- .add(*Idx)
- .addImm(Offset);
- MachineInstr *SetOn =
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addReg(Tmp, RegState::Kill)
- .addImm(IdxMode);
+static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
+ MachineRegisterInfo &MRI, MachineInstr &MI,
+ int Offset) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
- SetOn->getOperand(3).setIsUndef();
- }
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
- return true;
- }
+ if (Offset == 0)
+ return Idx->getReg();
- if (Offset == 0) {
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .add(*Idx);
- } else {
- BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
.add(*Idx)
.addImm(Offset);
- }
-
- return true;
+ return Tmp;
}
-// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
const GCNSubtarget &ST) {
@@ -3597,10 +3624,12 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineRegisterInfo &MRI = MF->getRegInfo();
Register Dst = MI.getOperand(0).getReg();
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
unsigned SubReg;
std::tie(SubReg, Offset)
@@ -3608,7 +3637,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
const bool UseGPRIdxMode = ST.useVGPRIndexMode();
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
+ // Check for a SGPR index.
+ if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
@@ -3616,14 +3646,19 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
// TODO: Look at the uses to avoid the copy. This may require rescheduling
// to avoid interfering with other uses, so probably requires a new
// optimization pass.
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
- .addReg(SrcReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
+ BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
+ .addReg(SrcReg)
+ .addReg(Idx)
+ .addImm(SubReg);
} else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+
BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, 0, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -3632,6 +3667,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return &MBB;
}
+ // Control flow needs to be inserted if indexing with a VGPR.
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
@@ -3640,19 +3676,23 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
- Offset, UseGPRIdxMode, true);
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
+
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
- .addReg(SrcReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
+
+ BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
+ .addReg(SrcReg)
+ .addReg(SGPRIdxReg)
+ .addImm(SubReg);
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
- .addReg(SrcReg, RegState::Undef, SubReg)
+ .addReg(SrcReg, 0, SubReg)
.addReg(SrcReg, RegState::Implicit);
}
@@ -3675,6 +3715,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
// This can be an immediate, but will be folded later.
assert(Val->getReg());
@@ -3700,23 +3741,36 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return &MBB;
}
- const MCInstrDesc &MovRelDesc
- = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false);
-
- if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
+ // Check for a SGPR index.
+ if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, I, DL, MovRelDesc, Dst)
- .addReg(SrcVec->getReg())
- .add(*Val)
- .addImm(SubReg);
- if (UseGPRIdxMode)
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ if (UseGPRIdxMode) {
+ Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
+ BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addReg(Idx)
+ .addImm(SubReg);
+ } else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+
+ const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
+ TRI.getRegSizeInBits(*VecRC), 32, false);
+ BuildMI(MBB, I, DL, MovRelDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addImm(SubReg);
+ }
MI.eraseFromParent();
return &MBB;
}
+ // Control flow needs to be inserted if indexing with a VGPR.
if (Val->isReg())
MRI.clearKillFlags(Val->getReg());
@@ -3724,16 +3778,28 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
Register PhiReg = MRI.createVirtualRegister(VecRC);
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode, false);
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
MachineBasicBlock *LoopBB = InsPt->getParent();
- BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
- .addReg(PhiReg)
- .add(*Val)
- .addImm(AMDGPU::sub0);
- if (UseGPRIdxMode)
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
+ if (UseGPRIdxMode) {
+ const MCInstrDesc &GPRIDXDesc =
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
+
+ BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addReg(SGPRIdxReg)
+ .addImm(AMDGPU::sub0);
+ } else {
+ const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
+ TRI.getRegSizeInBits(*VecRC), 32, false);
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addImm(AMDGPU::sub0);
+ }
MI.eraseFromParent();
return LoopBB;
@@ -3849,7 +3915,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
- unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
.addReg(CarryReg, RegState::Define)
.add(SrcReg0Sub0)
@@ -3912,10 +3978,29 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Src2.setReg(RegOp2);
}
- if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
- BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
- .addReg(Src2.getReg())
- .addImm(0);
+ const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
+ if (TRI->getRegSizeInBits(*Src2RC) == 64) {
+ if (ST.hasScalarCompareEq64()) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ } else {
+ const TargetRegisterClass *SubRC =
+ TRI->getSubRegClass(Src2RC, AMDGPU::sub0);
+ MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
+ MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
+ MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
+ MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
+ Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
+ .add(Src2Sub0)
+ .add(Src2Sub1);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+ .addReg(Src2_32, RegState::Kill)
+ .addImm(0);
+ }
} else {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
.addReg(Src2.getReg())
@@ -3936,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
- case AMDGPU::SI_INIT_EXEC:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
- AMDGPU::EXEC)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_LO:
- // This should be before all vector instructions.
- BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
- AMDGPU::EXEC_LO)
- .addImm(MI.getOperand(0).getImm());
- MI.eraseFromParent();
- return BB;
-
- case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
- // Extract the thread count from an SGPR input and set EXEC accordingly.
- // Since BFM can't shift by 64, handle that case with CMP + CMOV.
- //
- // S_BFE_U32 count, input, {shift, 7}
- // S_BFM_B64 exec, count, 0
- // S_CMP_EQ_U32 count, 64
- // S_CMOV_B64 exec, -1
- MachineInstr *FirstMI = &*BB->begin();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register InputReg = MI.getOperand(0).getReg();
- Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- bool Found = false;
-
- // Move the COPY of the input reg to the beginning, so that we can use it.
- for (auto I = BB->begin(); I != &MI; I++) {
- if (I->getOpcode() != TargetOpcode::COPY ||
- I->getOperand(0).getReg() != InputReg)
- continue;
-
- if (I == FirstMI) {
- FirstMI = &*++BB->begin();
- } else {
- I->removeFromParent();
- BB->insert(FirstMI, &*I);
- }
- Found = true;
- break;
- }
- assert(Found);
- (void)Found;
-
- // This should be before all vector instructions.
- unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
- bool isWave32 = getSubtarget()->isWave32();
- unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
- .addReg(InputReg)
- .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
- Exec)
- .addReg(CountReg)
- .addImm(0);
- BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
- .addReg(CountReg, RegState::Kill)
- .addImm(getSubtarget()->getWavefrontSize());
- BuildMI(*BB, FirstMI, DebugLoc(),
- TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
- Exec)
- .addImm(-1);
- MI.eraseFromParent();
- return BB;
- }
-
case AMDGPU::GET_GROUPSTATICSIZE: {
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
@@ -4086,13 +4100,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
-
- // Add an implicit use of the frame offset reg to prevent the restore copy
- // inserted after the call from being reorderd after stack operations in the
- // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
- .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL: {
@@ -4111,9 +4120,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
- case AMDGPU::V_ADD_I32_e32:
- case AMDGPU::V_SUB_I32_e32:
- case AMDGPU::V_SUBREV_I32_e32: {
+ case AMDGPU::V_ADD_CO_U32_e32:
+ case AMDGPU::V_SUB_CO_U32_e32:
+ case AMDGPU::V_SUBREV_CO_U32_e32: {
// TODO: Define distinct V_*_I32_Pseudo instructions instead.
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opc = MI.getOpcode();
@@ -4154,9 +4163,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return emitGWSMemViolTestLoop(MI, BB);
case AMDGPU::S_SETREG_B32: {
- if (!getSubtarget()->hasDenormModeInst())
- return BB;
-
// Try to optimize cases that only set the denormal mode or rounding mode.
//
// If the s_setreg_b32 fully sets all of the bits in the rounding mode or
@@ -4166,9 +4172,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// FIXME: This could be predicates on the immediate, but tablegen doesn't
// allow you to have a no side effect instruction in the output of a
// sideeffecting pattern.
-
- // TODO: Should also emit a no side effects pseudo if only FP bits are
- // touched, even if not all of them or to a variable.
unsigned ID, Offset, Width;
AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
if (ID != AMDGPU::Hwreg::ID_MODE)
@@ -4176,50 +4179,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
const unsigned SetMask = WidthMask << Offset;
- unsigned SetDenormOp = 0;
- unsigned SetRoundOp = 0;
-
- // The dedicated instructions can only set the whole denorm or round mode at
- // once, not a subset of bits in either.
- if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
- AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
- // If this fully sets both the round and denorm mode, emit the two
- // dedicated instructions for these.
- assert(Offset == 0);
- SetRoundOp = AMDGPU::S_ROUND_MODE;
- SetDenormOp = AMDGPU::S_DENORM_MODE;
- } else if (Width == 4) {
- if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
+
+ if (getSubtarget()->hasDenormModeInst()) {
+ unsigned SetDenormOp = 0;
+ unsigned SetRoundOp = 0;
+
+ // The dedicated instructions can only set the whole denorm or round mode
+ // at once, not a subset of bits in either.
+ if (SetMask ==
+ (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
+ // If this fully sets both the round and denorm mode, emit the two
+ // dedicated instructions for these.
SetRoundOp = AMDGPU::S_ROUND_MODE;
- assert(Offset == 0);
- } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
SetDenormOp = AMDGPU::S_DENORM_MODE;
- assert(Offset == 4);
+ } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
}
- }
- if (SetRoundOp || SetDenormOp) {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
- MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
- if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
- unsigned ImmVal = Def->getOperand(1).getImm();
- if (SetRoundOp) {
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
- .addImm(ImmVal & 0xf);
+ if (SetRoundOp || SetDenormOp) {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
+ if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
+ unsigned ImmVal = Def->getOperand(1).getImm();
+ if (SetRoundOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
+ .addImm(ImmVal & 0xf);
+
+ // If we also have the denorm mode, get just the denorm mode bits.
+ ImmVal >>= 4;
+ }
- // If we also have the denorm mode, get just the denorm mode bits.
- ImmVal >>= 4;
- }
+ if (SetDenormOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
+ .addImm(ImmVal & 0xf);
+ }
- if (SetDenormOp) {
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
- .addImm(ImmVal & 0xf);
+ MI.eraseFromParent();
+ return BB;
}
-
- MI.eraseFromParent();
}
}
+  // If only FP bits are touched, use the no-side-effects pseudo.
+ if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
+ AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
+ MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
+
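+  // Worked example (a sketch; assumes the written value is a moved immediate
+  // and the subtarget has the dedicated mode instructions):
+  //   s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), 3  -->  s_round_mode 0x3
+  // A width-8 write at offset 0 covers both fields and additionally emits
+  // s_denorm_mode with the upper four bits of the value.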
return BB;
}
default:
@@ -4256,6 +4263,12 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
}
+LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
+ return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
+ ? Ty.changeElementSize(16)
+ : Ty.changeElementSize(32);
+}
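+// For example (a sketch): with 16-bit instructions available, shifts of s16
+// (or <N x s16>) values keep a 16-bit shift-amount type; everything else uses
+// a 32-bit amount, mirroring getScalarShiftAmountTy above.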
+
// Answering this is somewhat tricky and depends on the specific device which
// have different rates for fma or all f64 operations.
//
@@ -4457,6 +4470,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
return splitBinaryVectorOp(Op, DAG);
case ISD::SMULO:
case ISD::UMULO:
@@ -4467,31 +4484,47 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// Used for D16: casts the result of an instruction into the right vector and
+// packs values if loads return unpacked values.
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
const SDLoc &DL,
SelectionDAG &DAG, bool Unpacked) {
if (!LoadVT.isVector())
return Result;
+  // Cast back to the original packed type, or to a larger type that is a
+  // multiple of 32 bits for D16. Widening the return type is required for
+  // legalization.
+ EVT FittingLoadVT = LoadVT;
+ if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ FittingLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
+
if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
// Truncate to v2i16/v4i16.
- EVT IntLoadVT = LoadVT.changeTypeToInteger();
+ EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
// Workaround legalizer not scalarizing truncate after vector op
- // legalization byt not creating intermediate vector trunc.
+ // legalization but not creating intermediate vector trunc.
SmallVector<SDValue, 4> Elts;
DAG.ExtractVectorElements(Result, Elts);
for (SDValue &Elt : Elts)
Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+    // Pad illegal v1i16/v3f16 to v4i16.
+ if ((LoadVT.getVectorNumElements() % 2) == 1)
+ Elts.push_back(DAG.getUNDEF(MVT::i16));
+
Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
// Bitcast to original type (v2f16/v4f16).
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
// Cast back to the original packed type.
- return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}
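+// For instance (a sketch of the odd-element path above): an unpacked v3i32
+// result for a v3f16 load is truncated lane-by-lane to i16, padded with one
+// undef lane to v4i16, and then bitcast to the widened v4f16 FittingLoadVT.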
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
@@ -4505,10 +4538,16 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
EVT LoadVT = M->getValueType(0);
EVT EquivLoadVT = LoadVT;
- if (Unpacked && LoadVT.isVector()) {
- EquivLoadVT = LoadVT.isVector() ?
- EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- LoadVT.getVectorNumElements()) : LoadVT;
+ if (LoadVT.isVector()) {
+ if (Unpacked) {
+ EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements());
+ } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
+ // Widen v3f16 to legal type
+ EquivLoadVT =
+ EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
+ LoadVT.getVectorNumElements() + 1);
+ }
}
// Change from v4f16/v2f16 to EquivLoadVT.
@@ -4519,8 +4558,6 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
VTList, Ops, M->getMemoryVT(),
M->getMemOperand());
- if (!Unpacked) // Just adjusted the opcode.
- return Load;
SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
@@ -4724,8 +4761,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
if (Res.getOpcode() == ISD::MERGE_VALUES) {
// FIXME: Hacky
- Results.push_back(Res.getOperand(0));
- Results.push_back(Res.getOperand(1));
+ for (unsigned I = 0; I < Res.getNumOperands(); I++) {
+ Results.push_back(Res.getOperand(I));
+ }
} else {
Results.push_back(Res);
Results.push_back(Res.getValue(1));
@@ -4967,7 +5005,7 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
// Get the return address reg and mark it as an implicit live-in
- unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
+ Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
@@ -5063,7 +5101,7 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
assert(UserSGPR != AMDGPU::NoRegister);
SDValue QueuePtr = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
@@ -5136,14 +5174,15 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// private_segment_aperture_base_hi.
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
- SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
+ SDValue Ptr =
+ DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available and how do we get it?
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
- MinAlign(64, StructOffset),
+ commonAlignment(Align(64), StructOffset),
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
}
@@ -5504,7 +5543,9 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// variable, but since the encoding of $symbol starts 4 bytes after the start
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address.
+ // compute the correct address. Similarly for the s_addc_u32 instruction, the
+ // encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
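+  //
+  // Roughly (a sketch; register numbers are illustrative) the final sequence
+  // is:
+  //   s_getpc_b64 s[0:1]
+  //   s_add_u32   s0, s0, symbol@rel32@lo+4
+  //   s_addc_u32  s1, s1, symbol@rel32@hi+12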
SDValue PtrLo =
DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
SDValue PtrHi;
@@ -5512,7 +5553,7 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
} else {
PtrHi =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
}
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -5521,15 +5562,32 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
+ SDLoc DL(GSD);
+ EVT PtrVT = Op.getValueType();
+
const GlobalValue *GV = GSD->getGlobal();
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
- GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
+ GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+ GV->hasExternalLinkage()) {
+ Type *Ty = GV->getValueType();
+      // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
+      // zero-sized type in other languages, to declare dynamic shared memory
+      // whose size is not known at compile time. It is allocated by the
+      // runtime and placed directly after the statically allocated LDS, so
+      // all such declarations share the same offset.
+ if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
+ // Adjust alignment for that dynamic shared memory array.
+ MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+ return SDValue(
+ DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
+ }
+ }
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
-
- SDLoc DL(GSD);
- EVT PtrVT = Op.getValueType();
+ }
if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
@@ -5713,7 +5771,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
SDValue Data(Result, 0);
SDValue TexFail;
- if (IsTexFail) {
+ if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
if (MaskPopVT.isVector()) {
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
@@ -5722,10 +5780,6 @@ static SDValue constructRetValue(SelectionDAG &DAG,
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);
}
-
- TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
- SDValue(Result, 0),
- DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}
if (DataDwordVT.isVector())
@@ -5735,13 +5789,27 @@ static SDValue constructRetValue(SelectionDAG &DAG,
if (IsD16)
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
- if (!ReqRetVT.isVector())
+ EVT LegalReqRetVT = ReqRetVT;
+ if (!ReqRetVT.isVector()) {
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
+ } else {
+ // We need to widen the return vector to a legal type
+ if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
+ ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
+ LegalReqRetVT =
+ EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
+ ReqRetVT.getVectorNumElements() + 1);
+ }
+ }
+ Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
- Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
+ if (IsTexFail) {
+ TexFail =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
+ DAG.getConstant(MaskPopDwords, DL, MVT::i32));
- if (TexFail)
return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+ }
if (Result->getNumValues() == 1)
return Data;
@@ -5798,7 +5866,7 @@ static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
- SelectionDAG &DAG) const {
+ SelectionDAG &DAG, bool WithChain) const {
SDLoc DL(Op);
MachineFunction &MF = DAG.getMachineFunction();
const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
@@ -5810,10 +5878,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
- bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
- SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
- SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
+ SmallVector<EVT, 3> ResultTypes(Op->values());
+ SmallVector<EVT, 3> OrigResultTypes(Op->values());
bool IsD16 = false;
bool IsG16 = false;
bool IsA16 = false;
@@ -5821,7 +5889,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
int NumVDataDwords;
bool AdjustRetType = false;
- unsigned AddrIdx; // Index of first address argument
+ // Offset of intrinsic arguments
+ const unsigned ArgOffset = WithChain ? 2 : 1;
+
unsigned DMask;
unsigned DMaskLanes = 0;
@@ -5839,15 +5909,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
DMask = Is64Bit ? 0xf : 0x3;
NumVDataDwords = Is64Bit ? 4 : 2;
- AddrIdx = 4;
} else {
DMask = Is64Bit ? 0x3 : 0x1;
NumVDataDwords = Is64Bit ? 2 : 1;
- AddrIdx = 3;
}
} else {
- unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
- auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ auto *DMaskConst =
+ cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
DMask = DMaskConst->getZExtValue();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
@@ -5860,7 +5928,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
return Op; // D16 is unsupported for this instruction
IsD16 = true;
- VData = handleD16VData(VData, DAG);
+ VData = handleD16VData(VData, DAG, true);
}
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
@@ -5880,63 +5948,56 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
(!LoadVT.isVector() && DMaskLanes > 1))
return Op;
- if (IsD16 && !Subtarget->hasUnpackedD16VMem())
+  // The sq block of gfx8 and gfx9 does not estimate register use correctly
+ // for d16 image_gather4, image_gather4_l, and image_gather4_lz
+ // instructions.
+ if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
+ !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
NumVDataDwords = (DMaskLanes + 1) / 2;
else
NumVDataDwords = DMaskLanes;
AdjustRetType = true;
}
-
- AddrIdx = DMaskIdx + 1;
}
- unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
- unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
- unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
- unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
- NumCoords + NumLCM;
- unsigned NumMIVAddrs = NumVAddrs;
-
+ unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
SmallVector<SDValue, 4> VAddrs;
// Optimize _L to _LZ when _L is zero
if (LZMappingInfo) {
- if (auto ConstantLod =
- dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>(
+ Op.getOperand(ArgOffset + Intr->LodIndex))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- NumMIVAddrs--; // remove 'lod'
+ VAddrEnd--; // remove 'lod'
}
}
}
// Optimize _mip away, when 'lod' is zero
if (MIPMappingInfo) {
- if (auto ConstantLod =
- dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
+ if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
+ Op.getOperand(ArgOffset + Intr->MipIndex))) {
if (ConstantLod->isNullValue()) {
IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- NumMIVAddrs--; // remove 'lod'
+ VAddrEnd--; // remove 'mip'
}
}
}
// Push back extra arguments.
- for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++)
- VAddrs.push_back(Op.getOperand(AddrIdx + I));
+ for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++)
+ VAddrs.push_back(Op.getOperand(ArgOffset + I));
// Check for 16 bit addresses or derivatives and pack if true.
- unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
- unsigned CoordIdx = DimIdx + NumGradients;
- unsigned CoordsEnd = AddrIdx + NumMIVAddrs;
-
- MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
+ MVT VAddrVT =
+ Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT VAddrScalarVT = VAddrVT.getScalarType();
MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
- VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType();
+ VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
VAddrScalarVT = VAddrVT.getScalarType();
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
if (IsA16 || IsG16) {
@@ -5971,17 +6032,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
// Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx;
- packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx,
- PackEndIdx, NumGradients);
+ const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
+ packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
+ ArgOffset + Intr->GradientStart, PackEndIdx,
+ Intr->NumGradients);
if (!IsA16) {
// Add uncompressed address
- for (unsigned I = CoordIdx; I < CoordsEnd; I++)
+ for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
} else {
- for (unsigned I = DimIdx; I < CoordsEnd; I++)
+ for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
@@ -6004,22 +6066,19 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
- unsigned CtrlIdx; // Index of texfailctrl argument
SDValue Unorm;
if (!BaseOpcode->Sampler) {
Unorm = True;
- CtrlIdx = AddrIdx + NumVAddrs + 1;
} else {
auto UnormConst =
- cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
Unorm = UnormConst->getZExtValue() ? True : False;
- CtrlIdx = AddrIdx + NumVAddrs + 3;
}
SDValue TFE;
SDValue LWE;
- SDValue TexFail = Op.getOperand(CtrlIdx);
+ SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
bool IsTexFail = false;
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
return Op;
@@ -6066,42 +6125,40 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue DLC;
if (BaseOpcode->Atomic) {
GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
+ DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr))
return Op;
} else {
- if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
- IsGFX10 ? &DLC : nullptr))
+ if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
+ DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr))
return Op;
}
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
- if (UseNSA) {
- for (const SDValue &Addr : VAddrs)
- Ops.push_back(Addr);
- } else {
+ if (UseNSA)
+ append_range(Ops, VAddrs);
+ else
Ops.push_back(VAddr);
- }
- Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
if (BaseOpcode->Sampler)
- Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- if (IsGFX10)
+ if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
Ops.push_back(TFE);
Ops.push_back(LWE);
- if (!IsGFX10)
+ if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
Ops.push_back(IsD16 ? True : False);
@@ -6112,7 +6169,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX10) {
+ if (IsGFX10Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx10NSA
: AMDGPU::MIMGEncGfx10Default,
@@ -6391,11 +6448,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
+ bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
SDValue GLC;
SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
- IsGFX10 ? &DLC : nullptr))
+ IsGFX10Plus ? &DLC : nullptr))
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -6417,11 +6474,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
- DiagnosticInfoUnsupported BadIntrin(
- MF.getFunction(), "intrinsic not supported on subtarget",
- DL.getDebugLoc());
- DAG.getContext()->diagnose(BadIntrin);
- return DAG.getUNDEF(VT);
+ return emitRemovedIntrinsicError(DAG, DL, VT);
}
case Intrinsic::amdgcn_ldexp:
return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
@@ -6567,7 +6620,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, false);
return Op;
}
@@ -6597,26 +6650,59 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset,
cast<ConstantSDNode>(Offset)->getSExtValue();
}
-static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_PS:
- return 1;
- case CallingConv::AMDGPU_VS:
- return 2;
- case CallingConv::AMDGPU_GS:
- return 3;
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_LS:
- case CallingConv::AMDGPU_ES:
- report_fatal_error("ds_ordered_count unsupported for this calling conv");
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::C:
- case CallingConv::Fast:
- default:
- // Assume other calling conventions are various compute callable functions
- return 0;
- }
+SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOpcode) const {
+ SDLoc DL(Op);
+
+ SDValue VData = Op.getOperand(2);
+ auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+
+ auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+
+ EVT MemVT = VData.getValueType();
+ return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
+ M->getMemOperand());
+}
+
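+// The struct form below differs from the raw form above only in the index
+// handling (a sketch of the operand layout): it forwards the caller's vindex
+// operand and sets idxen to 1, whereas the raw form substitutes a zero vindex
+// with idxen set to 0; the remaining intrinsic operands map onto the same
+// BUFFER_ATOMIC_* node operands.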
+SDValue
+SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOpcode) const {
+ SDLoc DL(Op);
+
+ SDValue VData = Op.getOperand(2);
+ auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ SDValue Ops[] = {
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Op.getOperand(3), // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ };
+
+ auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
+
+ EVT MemVT = VData.getValueType();
+ return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
+ M->getMemOperand());
}
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
@@ -6656,7 +6742,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
report_fatal_error("ds_ordered_count: wave_done requires wave_release");
unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
- unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction());
+ unsigned ShaderType =
+ SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
(Instruction << 4);
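+    // Bit layout of the fields assembled into Offset1 above (a sketch):
+    //   [0] wave_release, [1] wave_done, [3:2] shader type, [4] instruction.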
@@ -6893,7 +6980,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_buffer_atomic_xor:
+ case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned IdxEn = 1;
if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
@@ -6953,6 +7041,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_buffer_atomic_fadd:
+ if (!Op.getValue(0).use_empty()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6960,155 +7059,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
M->getMemOperand());
}
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
- case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
- auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_raw_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_inc:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
- break;
- case Intrinsic::amdgcn_raw_buffer_atomic_dec:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
- case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
- auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
- };
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_struct_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_inc:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
- break;
- case Intrinsic::amdgcn_struct_buffer_atomic_dec:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned IdxEn = 1;
@@ -7180,7 +7188,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_global_atomic_csub: {
+ case Intrinsic::amdgcn_global_atomic_fadd: {
+ if (!Op.getValue(0).use_empty()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -7188,15 +7204,85 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getOperand(3) // Value
};
- return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
- M->getVTList(), Ops, M->getMemoryVT(),
- M->getMemOperand());
+ EVT VT = Op.getOperand(3).getValueType();
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand());
}
+ case Intrinsic::amdgcn_image_bvh_intersect_ray: {
+ SDLoc DL(Op);
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue NodePtr = M->getOperand(2);
+ SDValue RayExtent = M->getOperand(3);
+ SDValue RayOrigin = M->getOperand(4);
+ SDValue RayDir = M->getOperand(5);
+ SDValue RayInvDir = M->getOperand(6);
+ SDValue TDescr = M->getOperand(7);
+
+ assert(NodePtr.getValueType() == MVT::i32 ||
+ NodePtr.getValueType() == MVT::i64);
+ assert(RayDir.getValueType() == MVT::v4f16 ||
+ RayDir.getValueType() == MVT::v4f32);
+
+ bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+ bool Is64 = NodePtr.getValueType() == MVT::i64;
+ unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+ : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+
+ SmallVector<SDValue, 16> Ops;
+
+ auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
+ SmallVector<SDValue, 3> Lanes;
+ DAG.ExtractVectorElements(Op, Lanes, 0, 3);
+ if (Lanes[0].getValueSizeInBits() == 32) {
+ for (unsigned I = 0; I < 3; ++I)
+ Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
+ } else {
+ if (IsAligned) {
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Lanes[0], Lanes[1] })));
+ Ops.push_back(Lanes[2]);
+ } else {
+ SDValue Elt0 = Ops.pop_back_val();
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Elt0, Lanes[0] })));
+ Ops.push_back(
+ DAG.getBitcast(MVT::i32,
+ DAG.getBuildVector(MVT::v2f16, DL,
+ { Lanes[1], Lanes[2] })));
+ }
+ }
+ };
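+    // A sketch of the resulting a16 operand layout for v4f16 dir/invdir (the
+    // origin stays f32 and is emitted as three whole dwords; lane names are
+    // illustrative): dir is "aligned", so it contributes {d.x,d.y} plus a
+    // loose d.z; invdir is not, so the loose d.z is popped and re-packed,
+    // giving the dwords o.x, o.y, o.z, {d.x,d.y}, {d.z,i.x}, {i.y,i.z}.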
+ if (Is64)
+ DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0, 2);
+ else
+ Ops.push_back(NodePtr);
+
+ Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
+ packLanes(RayOrigin, true);
+ packLanes(RayDir, true);
+ packLanes(RayInvDir, false);
+ Ops.push_back(TDescr);
+ if (IsA16)
+ Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
+ Ops.push_back(M->getChain());
+
+ auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
+ MachineMemOperand *MemRef = M->getMemOperand();
+ DAG.setNodeMemRefs(NewNode, {MemRef});
+ return SDValue(NewNode, 0);
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, true);
return SDValue();
}
@@ -7234,8 +7320,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
return NewOp;
}
-SDValue SITargetLowering::handleD16VData(SDValue VData,
- SelectionDAG &DAG) const {
+SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
+ bool ImageStore) const {
EVT StoreVT = VData.getValueType();
// No change for f16 and legal vector D16 types.
@@ -7243,19 +7329,70 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
return VData;
SDLoc DL(VData);
- assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+ unsigned NumElements = StoreVT.getVectorNumElements();
if (Subtarget->hasUnpackedD16VMem()) {
// We need to unpack the packed data to store.
EVT IntStoreVT = StoreVT.changeTypeToInteger();
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
- EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- StoreVT.getVectorNumElements());
+ EVT EquivStoreVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
return DAG.UnrollVectorOp(ZExt.getNode());
}
+ // The sq block of gfx8.1 does not estimate register use correctly for d16
+ // image store instructions. The data operand is computed as if it were not a
+ // d16 image instruction.
+ if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
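+    // For instance (a sketch): a v3f16 store value {a, b, c} ends up as the
+    // v3i32 value { bitcast(v2i16{a, b}), bitcast(v2i16{c, undef}), undef },
+    // i.e. the register count the hardware expects for the non-d16 encoding.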
+ // Bitcast to i16
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ // Decompose into scalars
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(IntVData, Elts);
+
+ // Group pairs of i16 into v2i16 and bitcast to i32
+ SmallVector<SDValue, 4> PackedElts;
+ for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
+ SDValue Pair =
+ DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
+ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+ PackedElts.push_back(IntPair);
+ }
+ if ((NumElements % 2) == 1) {
+ // Handle v3i16
+ unsigned I = Elts.size() / 2;
+ SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
+ {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
+ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
+ PackedElts.push_back(IntPair);
+ }
+
+ // Pad using UNDEF
+ PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
+
+ // Build final vector
+ EVT VecVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
+ return DAG.getBuildVector(VecVT, DL, PackedElts);
+ }
+
+ if (NumElements == 3) {
+ EVT IntStoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT WidenedStoreVT = EVT::getVectorVT(
+ *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
+ EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
+ WidenedStoreVT.getStoreSizeInBits());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
+ return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
+ }
+
assert(isTypeLegal(StoreVT));
return VData;
}
@@ -7433,8 +7570,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
EVT VDataVT = VData.getValueType();
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
@@ -7478,8 +7617,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
EVT EltType = VDataVT.getScalarType();
bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
- if (IsD16)
+ if (IsD16) {
VData = handleD16VData(VData, DAG);
+ VDataVT = VData.getValueType();
+ }
if (!isTypeLegal(VDataVT)) {
VData =
@@ -7514,57 +7655,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
-
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
- EVT VT = Op.getOperand(2).getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
- unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
- : AMDGPUISD::BUFFER_ATOMIC_FADD;
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
-
- case Intrinsic::amdgcn_global_atomic_fadd: {
- SDValue Ops[] = {
- Chain,
- Op.getOperand(2), // ptr
- Op.getOperand(3) // vdata
- };
- EVT VT = Op.getOperand(3).getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- if (VT.isVector()) {
- return DAG.getMemIntrinsicNode(
- AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
-
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand()).getValue(1);
- }
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -7572,7 +7662,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
- return lowerImage(Op, ImageDimIntr, DAG);
+ return lowerImage(Op, ImageDimIntr, DAG, true);
return Op;
}
@@ -7848,13 +7938,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- MemVT, *Load->getMemOperand())) {
- SDValue Ops[2];
- std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
- return DAG.getMergeValues(Ops, DL);
- }
-
unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
@@ -7879,9 +7962,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
- if (NumElements == 3)
- return WidenVectorLoad(Op, DAG);
- return SplitVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
}
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
@@ -7897,9 +7978,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
- if (NumElements == 3)
- return WidenVectorLoad(Op, DAG);
- return SplitVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
}
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
@@ -7914,7 +7993,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
- return WidenVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
+
// v3 and v4 loads are supported for private and global memory.
return SDValue();
}
@@ -7938,15 +8018,19 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
// v3 loads not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
- return WidenVectorLoad(Op, DAG);
+ return WidenOrSplitVectorLoad(Op, DAG);
+
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_read_b128 if possible.
- if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
- MemVT.getStoreSize() == 16)
+ // Use ds_read_b128 or ds_read_b96 when possible.
+ if (Subtarget->hasDS96AndDS128() &&
+ ((Subtarget->useDS128() && MemVT.getStoreSize() == 16) ||
+ MemVT.getStoreSize() == 12) &&
+ allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
+ Load->getAlign()))
return SDValue();
if (NumElements > 2)
@@ -7963,6 +8047,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorLoad(Op, DAG);
}
}
+
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
return SDValue();
}
@@ -8003,8 +8095,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasApproximateFuncs();
+ bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
@@ -8045,6 +8136,33 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
+SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+ EVT VT = Op.getValueType();
+ const SDNodeFlags Flags = Op->getFlags();
+
+ bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
+ DAG.getTarget().Options.UnsafeFPMath;
+ if (!AllowInaccurateDiv)
+ return SDValue();
+
+ SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
+ SDValue One = DAG.getConstantFP(1.0, SL, VT);
+
+ SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
+ SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+
+ R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
+ SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
+ R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
+ SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
+ SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
+ return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
+}
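+// A sketch of the refinement emitted above (assuming AMDGPUISD::RCP yields an
+// approximate 1/Y): two Newton-Raphson steps on the reciprocal, then a
+// residual correction on the quotient:
+//   r = rcp(Y)
+//   r = fma(1 - Y*r, r, r)    // first refinement step
+//   r = fma(1 - Y*r, r, r)    // second refinement step
+//   q = X * r
+//   q = fma(X - Y*q, r, q)    // correct the quotient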
+
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue GlueChain,
SDNodeFlags Flags) {
@@ -8273,8 +8391,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
- if (DAG.getTarget().Options.UnsafeFPMath)
- return lowerFastUnsafeFDIV(Op, DAG);
+ if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
+ return FastLowered;
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
@@ -8368,11 +8486,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
- VT, *Store->getMemOperand())) {
- return expandUnalignedStore(Store, DAG);
- }
-
unsigned AS = Store->getAddressSpace();
if (Subtarget->hasLDSMisalignedBug() &&
AS == AMDGPUAS::FLAT_ADDRESS &&
@@ -8397,6 +8510,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// v3 stores not supported on SI.
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
return SplitVectorStore(Op, DAG);
+
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand()))
+ return expandUnalignedStore(Store, DAG);
+
return SDValue();
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -8407,16 +8525,20 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
return SDValue();
case 16:
- if (NumElements > 4 || NumElements == 3)
+ if (NumElements > 4 ||
+ (NumElements == 3 && !Subtarget->enableFlatScratch()))
return SplitVectorStore(Op, DAG);
return SDValue();
default:
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- // Use ds_write_b128 if possible.
- if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
- VT.getStoreSize() == 16 && NumElements != 3)
+ // Use ds_write_b128 or ds_write_b96 when possible.
+ if (Subtarget->hasDS96AndDS128() &&
+ ((Subtarget->useDS128() && VT.getStoreSize() == 16) ||
+ (VT.getStoreSize() == 12)) &&
+ allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
+ Store->getAlign()))
return SDValue();
if (NumElements > 2)
@@ -8433,6 +8555,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SplitVectorStore(Op, DAG);
}
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand())) {
+ if (VT.isVector())
+ return SplitVectorStore(Op, DAG);
+ return expandUnalignedStore(Store, DAG);
+ }
+
return SDValue();
} else {
llvm_unreachable("unhandled address space");
@@ -8474,7 +8603,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
unsigned AS = AtomicNode->getAddressSpace();
// No custom lowering required for local address space
- if (!isFlatGlobalAddrSpace(AS))
+ if (!AMDGPU::isFlatGlobalAddrSpace(AS))
return Op;
// Non-local address space requires custom lowering for atomic compare
@@ -8584,7 +8713,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
EVT VT = N->getValueType(0);
SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
- SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
+ SDValue COffset = DAG.getConstant(Offset, SL, VT);
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
@@ -8594,12 +8723,28 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
+/// MemSDNode::getBasePtr() does not work for intrinsics, whose pointer operand
+/// is offset by the chain and intrinsic ID. In theory we would also need to
+/// check the specific intrinsic, but they all place the pointer operand first.
+static unsigned getBasePtrIndex(const MemSDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::STORE:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ return 2;
+ default:
+ return 1;
+ }
+}
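[Editor's note, not part of the patch: for reference, the operand layouts that make these indices correct, summarized as an illustration:

// LoadSDNode (and most MemSDNodes):   (Chain, BasePtr, Offset)        -> index 1
// StoreSDNode:                        (Chain, Value, BasePtr, Offset) -> index 2
// INTRINSIC_W_CHAIN / INTRINSIC_VOID: (Chain, IntrinsicID, Ptr, ...)  -> index 2]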
+
SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
DAGCombinerInfo &DCI) const {
- SDValue Ptr = N->getBasePtr();
SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
+ unsigned PtrIdx = getBasePtrIndex(N);
+ SDValue Ptr = N->getOperand(PtrIdx);
+
// TODO: We could also do this for multiplies.
if (Ptr.getOpcode() == ISD::SHL) {
SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
@@ -8607,7 +8752,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
if (NewPtr) {
SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
- NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
+ NewOps[PtrIdx] = NewPtr;
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
}
@@ -8868,7 +9013,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
// and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
- N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
@@ -8965,7 +9110,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
// or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
- N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
uint32_t LHSMask = getPermuteMask(DAG, LHS);
uint32_t RHSMask = getPermuteMask(DAG, RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
@@ -10509,8 +10654,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return SDValue();
switch (N->getOpcode()) {
- default:
- return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::SUB:
@@ -10537,35 +10680,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
case ISD::FMA:
return performFMACombine(N, DCI);
- case ISD::LOAD: {
- if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
- return Widended;
- LLVM_FALLTHROUGH;
- }
- case ISD::STORE:
- case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_STORE:
- case ISD::ATOMIC_CMP_SWAP:
- case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
- case ISD::ATOMIC_SWAP:
- case ISD::ATOMIC_LOAD_ADD:
- case ISD::ATOMIC_LOAD_SUB:
- case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_LOAD_XOR:
- case ISD::ATOMIC_LOAD_NAND:
- case ISD::ATOMIC_LOAD_MIN:
- case ISD::ATOMIC_LOAD_MAX:
- case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX:
- case ISD::ATOMIC_LOAD_FADD:
- case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC:
- case AMDGPUISD::ATOMIC_LOAD_FMIN:
- case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
- if (DCI.isBeforeLegalize())
- break;
- return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
case ISD::AND:
return performAndCombine(N, DCI);
case ISD::OR:
@@ -10630,14 +10744,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performExtractVectorEltCombine(N, DCI);
case ISD::INSERT_VECTOR_ELT:
return performInsertVectorEltCombine(N, DCI);
+ case ISD::LOAD: {
+ if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
+ return Widended;
+ LLVM_FALLTHROUGH;
+ }
+ default: {
+ if (!DCI.isBeforeLegalize()) {
+ if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
+ return performMemSDNodeCombine(MemNode, DCI);
+ }
+
+ break;
}
+ }
+
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
- default: return 0;
+ default: return ~0u;
case AMDGPU::sub0: return 0;
case AMDGPU::sub1: return 1;
case AMDGPU::sub2: return 2;
@@ -10697,6 +10825,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
// set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ if (Lane == ~0u)
+ return Node;
// Check if the use is for the TFE/LWE generated result at VGPRn+1.
if (UsesTFC && Lane == TFCLane) {
@@ -10826,8 +10956,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
// to try understanding copies to physical registers.
- if (SrcVal.getValueType() == MVT::i1 &&
- Register::isPhysicalRegister(DestReg->getReg())) {
+ if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
SDLoc SL(Node);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue VReg = DAG.getRegister(
@@ -10870,7 +10999,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
unsigned Opcode = Node->getMachineOpcode();
if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
- !TII->isGather4(Opcode)) {
+ !TII->isGather4(Opcode) &&
+ AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) {
return adjustWritemask(Node, DAG);
}
@@ -10881,14 +11011,14 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
}
switch (Opcode) {
- case AMDGPU::V_DIV_SCALE_F32:
- case AMDGPU::V_DIV_SCALE_F64: {
+ case AMDGPU::V_DIV_SCALE_F32_e64:
+ case AMDGPU::V_DIV_SCALE_F64_e64: {
// Satisfy the operand register constraint when one of the inputs is
// undefined. Ordinarily each undef value will have its own implicit_def of
// a vreg, so force these to use a single register.
- SDValue Src0 = Node->getOperand(0);
- SDValue Src1 = Node->getOperand(1);
- SDValue Src2 = Node->getOperand(2);
+ SDValue Src0 = Node->getOperand(1);
+ SDValue Src1 = Node->getOperand(3);
+ SDValue Src2 = Node->getOperand(5);
if ((Src0.isMachineOpcode() &&
Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
@@ -10923,10 +11053,10 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
} else
break;
- SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
- for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
- Ops.push_back(Node->getOperand(I));
-
+ SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
+ Ops[1] = Src0;
+ Ops[3] = Src1;
+ Ops[5] = Src2;
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
@@ -10962,8 +11092,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineOperand &Op = MI.getOperand(I);
if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
- !Register::isVirtualRegister(Op.getReg()) ||
- !TRI->isAGPR(MRI, Op.getReg()))
+ !Op.getReg().isVirtual() || !TRI->isAGPR(MRI, Op.getReg()))
continue;
auto *Src = MRI.getUniqueVRegDef(Op.getReg());
if (!Src || !Src->isCopy() ||
@@ -10985,8 +11114,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- MI.setDesc(TII->get(NoRetAtomicOp));
+ int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::glc1);
+ if (Glc1Idx != -1)
+ MI.RemoveOperand(Glc1Idx);
MI.RemoveOperand(0);
+ MI.setDesc(TII->get(NoRetAtomicOp));
return;
}
@@ -11341,17 +11474,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- // Add VCC_HI def because many instructions marked as imp-use VCC where
- // we may only define VCC_LO. If nothing defines VCC_HI we may end up
- // having a use of undef.
-
const SIInstrInfo *TII = ST.getInstrInfo();
- DebugLoc DL;
-
- MachineBasicBlock &MBB = MF.front();
- MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
- BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
-
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
@@ -11379,6 +11502,55 @@ void SITargetLowering::computeKnownBitsForFrameIndex(
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
+static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
+ KnownBits &Known, unsigned Dim) {
+ unsigned MaxValue =
+ ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
+ Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+}
+
+void SITargetLowering::computeKnownBitsForTargetInstr(
+ GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI, unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_INTRINSIC: {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::amdgcn_workitem_id_x:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
+ break;
+ case Intrinsic::amdgcn_workitem_id_y:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
+ break;
+ case Intrinsic::amdgcn_workitem_id_z:
+ knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
+ break;
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi: {
+ // These return at most the wavefront size - 1.
+ unsigned Size = MRI.getType(R).getSizeInBits();
+ Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
+ break;
+ }
+ case Intrinsic::amdgcn_groupstaticsize: {
+ // We can report everything over the maximum size as 0. We can't report
+ // based on the actual size because we don't know if it's accurate or not
+ // at any given point.
+ Known.Zero.setHighBits(
+ countLeadingZeros(getSubtarget()->getLocalMemorySize()));
+ break;
+ }
+ }
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ Known.Zero.setHighBits(24);
+ break;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ Known.Zero.setHighBits(16);
+ break;
+ }
+}
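[Editor's note, not part of the patch: every case here follows the same pattern, namely that an upper bound on the value makes all bits above that bound's highest set bit known zero. A small standalone sketch of that reasoning (the bound 1023 is only an illustrative example):

#include <bit>       // std::countl_zero (C++20)
#include <cstdint>

// For a 32-bit value known to be <= MaxValue, every bit above the highest set
// bit of MaxValue must be zero; that count is exactly the leading-zero count.
unsigned knownZeroHighBits(uint32_t MaxValue) {
  return std::countl_zero(MaxValue);  // e.g. MaxValue == 1023 -> 22 known-zero high bits
}

The mbcnt case applies the same idea with the bound wavefront_size - 1, so on a wave64 target the top 32 - 6 = 26 bits of the 32-bit result are known zero.]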
+
Align SITargetLowering::computeKnownAlignForTargetInstr(
GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
unsigned Depth) const {
@@ -11484,46 +11656,40 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
return false;
}
-bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
- FunctionLoweringInfo * FLI, LegacyDivergenceAnalysis * KDA) const
-{
+bool SITargetLowering::isSDNodeSourceOfDivergence(
+ const SDNode *N, FunctionLoweringInfo *FLI,
+ LegacyDivergenceAnalysis *KDA) const {
switch (N->getOpcode()) {
- case ISD::CopyFromReg:
- {
- const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
- const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
- const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- Register Reg = R->getReg();
+ case ISD::CopyFromReg: {
+ const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
+ const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ Register Reg = R->getReg();
- // FIXME: Why does this need to consider isLiveIn?
- if (Reg.isPhysical() || MRI.isLiveIn(Reg))
- return !TRI->isSGPRReg(MRI, Reg);
+ // FIXME: Why does this need to consider isLiveIn?
+ if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+ return !TRI->isSGPRReg(MRI, Reg);
- if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
- return KDA->isDivergent(V);
+ if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
+ return KDA->isDivergent(V);
- assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
- return !TRI->isSGPRReg(MRI, Reg);
- }
- break;
- case ISD::LOAD: {
- const LoadSDNode *L = cast<LoadSDNode>(N);
- unsigned AS = L->getAddressSpace();
- // A flat load may access private memory.
- return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
- } break;
- case ISD::CALLSEQ_END:
+ assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
+ return !TRI->isSGPRReg(MRI, Reg);
+ }
+ case ISD::LOAD: {
+ const LoadSDNode *L = cast<LoadSDNode>(N);
+ unsigned AS = L->getAddressSpace();
+ // A flat load may access private memory.
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ }
+ case ISD::CALLSEQ_END:
return true;
- break;
- case ISD::INTRINSIC_WO_CHAIN:
- {
-
- }
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
- case ISD::INTRINSIC_W_CHAIN:
- return AMDGPU::isIntrinsicSourceOfDivergence(
- cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ case ISD::INTRINSIC_WO_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
}
return false;
}
@@ -11558,6 +11724,16 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
+// Global FP atomic instructions have a hardcoded FP mode: they do not support
+// FP32 denormals and support only v2f16 denormals.
+static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+ auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
+ if (&Flt == &APFloat::IEEEsingle())
+ return DenormMode == DenormalMode::getPreserveSign();
+ return DenormMode == DenormalMode::getIEEE();
+}
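[Editor's note, not part of the patch: a compact restatement of this check as a standalone predicate, using a simplified stand-in enum rather than llvm::DenormalMode (a sketch, not the patch's API):

// f32 global atomics flush denormals, so they match only a preserve-sign f32
// mode; the remaining types this applies to (v2f16) keep denormals, so they
// match only IEEE mode.
enum class SimpleDenormMode { IEEE, PreserveSign };

bool globalFPAtomicModeMatches(bool IsF32, SimpleDenormMode Mode) {
  return IsF32 ? Mode == SimpleDenormMode::PreserveSign
               : Mode == SimpleDenormMode::IEEE;
}

When the modes do not match, shouldExpandAtomicRMWInIR() below falls back to the CmpXChg expansion instead of emitting the global FP atomic.]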
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
@@ -11576,10 +11752,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+ if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ return AtomicExpansionKind::CmpXChg;
+
return RMW->use_empty() ? AtomicExpansionKind::None :
AtomicExpansionKind::CmpXChg;
}
+ // DS FP atomics do respect the denormal mode, but the rounding mode is fixed
+ // to round-to-nearest-even.
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
}