author    Dimitry Andric <dim@FreeBSD.org>   2021-07-29 20:15:26 +0000
committer Dimitry Andric <dim@FreeBSD.org>   2021-07-29 20:15:26 +0000
commit    344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree      f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent    b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1218
1 file changed, 801 insertions(+), 417 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 839437b5e3f8..d98acfc6c532 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,11 +19,13 @@
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
@@ -80,36 +82,49 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
- addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+
+ const SIRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
+
+ addRegisterClass(MVT::f64, V64RegClass);
+ addRegisterClass(MVT::v2f32, V64RegClass);
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
- addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+ addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
+ addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
- addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+ addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
+
+ addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
+ addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
+
+ addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
+ addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
- addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+ addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
- addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
- addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
@@ -123,7 +138,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
computeRegisterProperties(Subtarget->getRegisterInfo());
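
The register-class hunks above stop naming VReg_64/VReg_96/... directly and instead ask SIRegisterInfo for the VGPR class of a given bit width, so subtargets that require even-aligned VGPR tuples can be handed the *_Align2 variants. A minimal sketch of that selection pattern, using invented names rather than the real LLVM API:

#include <cstdio>
#include <string>

// Hypothetical stand-in for TRI->getVGPRClassForBitWidth(): return the name of
// the VGPR tuple class for a bit width, switching to an "_Align2" variant when
// the subtarget needs even-aligned VGPR pairs (as gfx90a-class targets do).
std::string getVGPRClassForBitWidth(unsigned BitWidth, bool NeedsAlignedVGPRs) {
  std::string Name = "VReg_" + std::to_string(BitWidth);
  return NeedsAlignedVGPRs ? Name + "_Align2" : Name;
}

int main() {
  std::printf("%s\n", getVGPRClassForBitWidth(64, false).c_str()); // VReg_64
  std::printf("%s\n", getVGPRClassForBitWidth(64, true).c_str());  // VReg_64_Align2
}
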
@@ -139,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v6i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v7i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
setOperationAction(ISD::LOAD, MVT::i1, Custom);
@@ -148,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v3i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v5i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v6i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v7i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -170,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
@@ -197,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
@@ -239,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+ MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
@@ -249,10 +279,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
case ISD::BITCAST:
case ISD::EXTRACT_VECTOR_ELT:
case ISD::INSERT_VECTOR_ELT:
- case ISD::INSERT_SUBVECTOR:
case ISD::EXTRACT_SUBVECTOR:
case ISD::SCALAR_TO_VECTOR:
break;
+ case ISD::INSERT_SUBVECTOR:
case ISD::CONCAT_VECTORS:
setOperationAction(Op, VT, Custom);
break;
@@ -284,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
+ }
+
for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
@@ -336,17 +380,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Avoid stack access for these.
// TODO: Generalize to more vector types.
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
-
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
@@ -362,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
- // Deal with vec5 vector operations when widened to vec8.
+ // Deal with vec5/6/7 vector operations when widened to vec8.
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
@@ -384,6 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
// FIXME: This should be narrowed to i32, but that only happens if i64 is
// illegal.
@@ -525,8 +571,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
- setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -718,6 +764,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+
+ if (Subtarget->hasPackedFP32Ops()) {
+ setOperationAction(ISD::FADD, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f32, Legal);
+ setOperationAction(ISD::FMA, MVT::v2f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f32, Legal);
+
+ for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) {
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
}
setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
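
With packed FP32 instructions available, v2f32 FADD/FMUL/FMA become Legal and the wider f32 vector types are marked Custom so they can be split (see splitBinaryVectorOp/splitTernaryVectorOp later in this diff). A hedged, standalone sketch of the splitting idea, halving the vector until the legal packed width is reached:

#include <cstdio>
#include <vector>

// Illustrative only: assumes a power-of-two element count >= 2. Wide f32
// vector adds are split in half recursively until they reach the packed
// v2f32 width, mirroring the Custom lowering above.
std::vector<float> faddSplit(const std::vector<float> &A,
                             const std::vector<float> &B) {
  if (A.size() == 2)                        // legal packed v2f32 op
    return {A[0] + B[0], A[1] + B[1]};
  size_t Half = A.size() / 2;
  std::vector<float> LoA(A.begin(), A.begin() + Half), HiA(A.begin() + Half, A.end());
  std::vector<float> LoB(B.begin(), B.begin() + Half), HiB(B.begin() + Half, B.end());
  std::vector<float> Lo = faddSplit(LoA, LoB), Hi = faddSplit(HiA, HiB);
  Lo.insert(Lo.end(), Hi.begin(), Hi.end());
  return Lo;
}

int main() {
  std::vector<float> A(8, 1.0f), B(8, 2.0f);
  std::printf("%g\n", faddSplit(A, B)[7]); // 3
}
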
@@ -1128,17 +1187,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags = MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1150,6 +1198,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MODereferenceable;
return true;
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1191,6 +1255,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_csub: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
@@ -1210,9 +1277,9 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
}
return AM.Scale == 0 &&
- (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
- /*Signed=*/false));
+ (AM.BaseOffs == 0 ||
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
@@ -1220,7 +1287,7 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
return AM.Scale == 0 &&
(AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
- /*Signed=*/true));
+ SIInstrFlags::FlatGlobal));
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
// Assume the we will use FLAT for all global memory accesses
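
Both addressing-mode checks above now pass an instruction-flag argument to isLegalFLATOffset instead of a bare Signed boolean; flat offsets are treated as unsigned while global (FlatGlobal) offsets are signed. A hedged sketch of that kind of range check, assuming illustrative field widths (the real widths depend on the subtarget and are not taken from this diff):

#include <cstdint>
#include <cstdio>

// Sketch only: flat instructions are assumed to take an unsigned offset field,
// global instructions a signed one. The 12/13-bit widths are placeholders,
// not the values for any particular GPU.
bool isLegalFlatOffset(int64_t Offset, bool IsFlatGlobal) {
  if (IsFlatGlobal) {
    const int64_t Min = -(1 << 12), Max = (1 << 12) - 1; // signed 13-bit
    return Offset >= Min && Offset <= Max;
  }
  return Offset >= 0 && Offset < (1 << 12);              // unsigned 12-bit
}

int main() {
  std::printf("%d %d\n", isLegalFlatOffset(-16, true), isLegalFlatOffset(-16, false));
}
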
@@ -1385,10 +1452,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return true;
}
+ // Either, the alignment requirements are "enabled", or there is an
+ // unaligned LDS access related hardware bug though alignment requirements
+ // are "disabled". In either case, we need to check for proper alignment
+ // requirements.
+ //
if (Size == 64) {
- // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
- // aligned, 8 byte access in a single operation using ds_read2/write2_b32
- // with adjacent offsets.
+ // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
+ // can do a 4 byte aligned, 8 byte access in a single operation using
+ // ds_read2/write2_b32 with adjacent offsets.
bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)
*IsFast = AlignedBy4;
@@ -1396,22 +1468,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
if (Size == 96) {
- // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
- bool Aligned = Alignment >= Align(16);
+ // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
+ // gfx8 and older.
+ bool AlignedBy16 = Alignment >= Align(16);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy16;
- return Aligned;
+ return AlignedBy16;
}
if (Size == 128) {
- // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
- // can do a 8 byte aligned, 16 byte access in a single operation using
- // ds_read2/write2_b64.
- bool Aligned = Alignment >= Align(8);
+ // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
+ // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
+ // single operation using ds_read2/write2_b64.
+ bool AlignedBy8 = Alignment >= Align(8);
if (IsFast)
- *IsFast = Aligned;
+ *IsFast = AlignedBy8;
- return Aligned;
+ return AlignedBy8;
}
}
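
The comments in this hunk amount to a small decision table for LDS accesses: a 64-bit access only needs 4-byte alignment (ds_read2/write2_b32 with adjacent offsets), a 96-bit access needs 16-byte alignment on gfx8 and older, and a 128-bit access gets by with 8-byte alignment via ds_read2/write2_b64. A standalone sketch of that table:

#include <cstdio>

// Minimal sketch of the LDS alignment rules described in the comments above
// (gfx8-and-older behaviour); only the 64/96/128-bit cases are modelled.
bool dsAccessIsAllowed(unsigned SizeInBits, unsigned AlignInBytes) {
  switch (SizeInBits) {
  case 64:  return AlignInBytes >= 4;  // ds_read2/write2_b32, adjacent offsets
  case 96:  return AlignInBytes >= 16; // ds_read/write_b96 needs 16 bytes
  case 128: return AlignInBytes >= 8;  // ds_read2/write2_b64
  default:  return false;              // other sizes not modelled here
  }
}

int main() {
  std::printf("%d %d %d\n", dsAccessIsAllowed(64, 4), dsAccessIsAllowed(96, 8),
              dsAccessIsAllowed(128, 8)); // 1 0 1
}
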
@@ -1467,8 +1540,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
}
bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Alignment,
- MachineMemOperand::Flags Flags, bool *IsFast) const {
+ EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
if (IsFast)
*IsFast = false;
@@ -1482,7 +1555,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
- Align(Alignment), Flags, IsFast);
+ Alignment, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1535,8 +1608,8 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
- int NumElts = VT.getVectorNumElements();
- if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ VT.getScalarType().bitsLE(MVT::i16))
return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -1799,23 +1872,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
+ unsigned Mask = (Subtarget->hasPackedTID() &&
+ Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
}
if (Info.hasWorkItemIDY()) {
- Register Reg = AMDGPU::VGPR1;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 10));
+ } else {
+ unsigned Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
+ }
}
if (Info.hasWorkItemIDZ()) {
- Register Reg = AMDGPU::VGPR2;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
+ assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
+ if (Subtarget->hasPackedTID()) {
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
+ 0x3ff << 20));
+ } else {
+ unsigned Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ CCInfo.AllocateReg(Reg);
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
+ }
}
}
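
With packed TIDs, all three workitem IDs share VGPR0: X in bits 0-9, Y in bits 10-19 and Z in bits 20-29, which is what the 0x3ff, 0x3ff << 10 and 0x3ff << 20 masks above encode. A small extraction sketch:

#include <cstdint>
#include <cstdio>

// Unpack workitem IDs from a packed-TID VGPR0 value (10 bits per component),
// matching the masks used in the hunk above.
struct WorkItemID { uint32_t X, Y, Z; };

WorkItemID unpackTID(uint32_t VGPR0) {
  return { VGPR0 & 0x3ff, (VGPR0 >> 10) & 0x3ff, (VGPR0 >> 20) & 0x3ff };
}

int main() {
  uint32_t Packed = (5u << 20) | (3u << 10) | 7u;
  WorkItemID ID = unpackTID(Packed);
  std::printf("x=%u y=%u z=%u\n", ID.X, ID.Y, ID.Z); // x=7 y=3 z=5
}
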
@@ -1865,12 +1952,32 @@ static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
return ArgDescriptor::createRegister(Reg);
}
-static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
+// If this has a fixed position, we still should allocate the register in the
+// CCInfo state. Technically we could get away with this for values passed
+// outside of the normal argument range.
+static void allocateFixedSGPRInputImpl(CCState &CCInfo,
+ const TargetRegisterClass *RC,
+ MCRegister Reg) {
+ Reg = CCInfo.AllocateReg(Reg);
+ assert(Reg != AMDGPU::NoRegister);
+ MachineFunction &MF = CCInfo.getMachineFunction();
+ MF.addLiveIn(Reg, RC);
+}
+
+static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
}
-static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
- return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
+static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
+ if (Arg) {
+ allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
+ Arg.getRegister());
+ } else
+ Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
/// Allocate implicit function VGPR arguments at the end of allocated user
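
The new allocateSGPR32Input/allocateSGPR64Input take the argument descriptor by reference: when the argument already carries a fixed register, that exact register is reserved in the calling-convention state; otherwise a free SGPR is picked as before. A plain sketch of that "use the fixed slot if present, else allocate" pattern, with invented stand-in types:

#include <cstdio>
#include <optional>
#include <set>

// Invented stand-ins for ArgDescriptor/CCState, just to show the control flow.
using Reg = int;

Reg allocateInput(std::optional<Reg> &Arg, std::set<Reg> &FreeRegs) {
  if (Arg) {                    // fixed position: reserve exactly that register
    FreeRegs.erase(*Arg);
    return *Arg;
  }
  Reg R = *FreeRegs.begin();    // otherwise grab any free register
  FreeRegs.erase(FreeRegs.begin());
  Arg = R;
  return R;
}

int main() {
  std::set<Reg> Free{10, 11, 12};
  std::optional<Reg> Fixed = 11, Floating;
  std::printf("%d %d\n", allocateInput(Fixed, Free), allocateInput(Floating, Free)); // 11 10
}
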
@@ -1919,29 +2026,29 @@ void SITargetLowering::allocateSpecialInputSGPRs(
// TODO: Unify handling with private memory pointers.
if (Info.hasDispatchPtr())
- ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
if (Info.hasQueuePtr())
- ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
if (Info.hasDispatchID())
- ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
+ allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
if (Info.hasWorkGroupIDX())
- ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
if (Info.hasWorkGroupIDY())
- ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
if (Info.hasWorkGroupIDZ())
- ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
+ allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
// Allocate special inputs passed in user SGPRs.
@@ -2203,6 +2310,8 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
+ Info->allocateModuleLDSGlobal(Fn.getParent());
+
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -2767,6 +2876,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) {
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
+ case CallingConv::AMDGPU_Gfx:
return true;
default:
return canGuaranteeTCO(CC);
@@ -2781,6 +2891,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
if (!mayTailCallThisCC(CalleeCC))
return false;
+ // For a divergent call target, we need to do a waterfall loop over the
+ // possible callees which precludes us from using a simple jump.
+ if (Callee->isDivergent())
+ return false;
+
MachineFunction &MF = DAG.getMachineFunction();
const Function &CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF.getCallingConv();
@@ -2888,12 +3003,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (!CLI.CB)
report_fatal_error("unsupported libcall legalization");
- if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
- !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) {
- return lowerUnhandledCall(CLI, InVals,
- "unsupported indirect call to function ");
- }
-
if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
return lowerUnhandledCall(CLI, InVals,
"unsupported required tail call to function ");
@@ -3054,7 +3163,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// locations, which are supposed to be immutable?
Chain = addTokenForArgument(Chain, DAG, MFI, FI);
} else {
- DstAddr = PtrOff;
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
+ MVT::i32);
+ DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
Alignment =
commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
@@ -4150,11 +4262,35 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return BB;
}
case AMDGPU::DS_GWS_INIT:
- case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_BR:
+ case AMDGPU::DS_GWS_BARRIER:
+ if (Subtarget->needsAlignedVGPRs()) {
+ // Add implicit aligned super-reg to force alignment on the data operand.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+ Register DataReg = Op->getReg();
+ bool IsAGPR = TRI->isAGPR(MRI, DataReg);
+ Register Undef = MRI.createVirtualRegister(
+ IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef);
+ Register NewVR =
+ MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
+ : &AMDGPU::VReg_64_Align2RegClass);
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR)
+ .addReg(DataReg, 0, Op->getSubReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Undef)
+ .addImm(AMDGPU::sub1);
+ Op->setReg(NewVR);
+ Op->setSubReg(AMDGPU::sub0);
+ MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
+ }
+ LLVM_FALLTHROUGH;
+ case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
- case AMDGPU::DS_GWS_BARRIER:
// A s_waitcnt 0 is required to be the instruction immediately following.
if (getSubtarget()->hasGWSAutoReplay()) {
bundleInstWithWaitcnt(MI);
@@ -4360,7 +4496,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4381,7 +4518,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4456,6 +4594,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerFMINNUM_FMAXNUM(Op, DAG);
case ISD::FMA:
return splitTernaryVectorOp(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -5092,12 +5233,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
+ return lowerTrapEndpgm(Op, DAG);
+
+ if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
+ switch (*HsaAbiVer) {
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ return lowerTrapHsaQueuePtr(Op, DAG);
+ case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
+ return Subtarget->supportsGetDoorbellID() ?
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
+ }
+ }
+
+ llvm_unreachable("Unknown trap handler");
+}
+
+SDValue SITargetLowering::lowerTrapEndpgm(
+ SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+}
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled())
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+SDValue SITargetLowering::lowerTrapHsaQueuePtr(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -5108,22 +5272,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
ToReg,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ DAG.getTargetConstant(TrapID, SL, MVT::i16),
SGPR01,
ToReg.getValue(1)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
+SDValue SITargetLowering::lowerTrapHsa(
+ SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
+
SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
- !Subtarget->isTrapHandlerEnabled()) {
+ if (!Subtarget->isTrapHandlerEnabled() ||
+ Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -5133,9 +5312,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return Chain;
}
+ uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
SDValue Ops[] = {
Chain,
- DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ DAG.getTargetConstant(TrapID, SL, MVT::i16)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
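
lowerTRAP now dispatches on the HSA code-object ABI version: V2 and V3 go through the queue-pointer path, V4 uses the plain HSA trap when the target can fetch the doorbell ID itself, and a target without an enabled AMDHSA trap handler falls back to endpgm. A hedged sketch of that selection (names are illustrative):

#include <cstdio>

enum class TrapLowering { Endpgm, HsaQueuePtr, Hsa };
enum class HsaAbi { V2, V3, V4 };

// Sketch of the selection logic in lowerTRAP above.
TrapLowering selectTrapLowering(bool TrapHandlerEnabled, bool AbiIsAmdHsa,
                                HsaAbi Ver, bool SupportsGetDoorbellID) {
  if (!TrapHandlerEnabled || !AbiIsAmdHsa)
    return TrapLowering::Endpgm;
  switch (Ver) {
  case HsaAbi::V2:
  case HsaAbi::V3:
    return TrapLowering::HsaQueuePtr;
  case HsaAbi::V4:
    return SupportsGetDoorbellID ? TrapLowering::Hsa : TrapLowering::HsaQueuePtr;
  }
  return TrapLowering::Endpgm; // not reached for the enumerated versions
}

int main() {
  std::printf("%d\n", static_cast<int>(selectTrapLowering(true, true, HsaAbi::V4, true)));
}
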
@@ -5666,23 +5846,10 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
ArrayRef<SDValue> Elts) {
assert(!Elts.empty());
MVT Type;
- unsigned NumElts;
-
- if (Elts.size() == 1) {
- Type = MVT::f32;
- NumElts = 1;
- } else if (Elts.size() == 2) {
- Type = MVT::v2f32;
- NumElts = 2;
- } else if (Elts.size() == 3) {
- Type = MVT::v3f32;
- NumElts = 3;
- } else if (Elts.size() <= 4) {
- Type = MVT::v4f32;
- NumElts = 4;
- } else if (Elts.size() <= 8) {
- Type = MVT::v8f32;
- NumElts = 8;
+ unsigned NumElts = Elts.size();
+
+ if (NumElts <= 8) {
+ Type = MVT::getVectorVT(MVT::f32, NumElts);
} else {
assert(Elts.size() <= 16);
Type = MVT::v16f32;
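
The rewritten getBuildDwordsVector collapses the old size ladder: up to eight elements it builds an exact vNf32 type, anything larger is padded out to v16f32. A quick sketch of the width selection:

#include <cassert>
#include <cstdio>

// Mirror of the "exact width up to 8, else pad to 16" rule used above when
// packing address dwords into an f32 vector.
unsigned dwordsVectorWidth(unsigned NumElts) {
  assert(NumElts >= 1 && NumElts <= 16 && "caller guarantees at most 16 dwords");
  return NumElts <= 8 ? NumElts : 16; // 9..16 elements are padded to v16f32
}

int main() {
  std::printf("%u %u %u\n", dwordsVectorWidth(3), dwordsVectorWidth(8),
              dwordsVectorWidth(11)); // 3 8 16
}
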
@@ -5704,28 +5871,6 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
return DAG.getBuildVector(Type, DL, VecElts);
}
-static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
- SDValue *GLC, SDValue *SLC, SDValue *DLC) {
- auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
-
- uint64_t Value = CachePolicyConst->getZExtValue();
- SDLoc DL(CachePolicy);
- if (GLC) {
- *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x1;
- }
- if (SLC) {
- *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x2;
- }
- if (DLC) {
- *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
- Value &= ~(uint64_t)0x4;
- }
-
- return Value == 0;
-}
-
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
SDValue Src, int ExtraElts) {
EVT SrcVT = Src.getValueType();
@@ -5752,7 +5897,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
ArrayRef<EVT> ResultTypes,
bool IsTexFail, bool Unpacked, bool IsD16,
int DMaskPop, int NumVDataDwords,
- const SDLoc &DL, LLVMContext &Context) {
+ const SDLoc &DL) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
@@ -5835,11 +5980,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
return Value == 0;
}
-static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
- MVT PackVectorVT,
- SmallVectorImpl<SDValue> &PackedAddrs,
- unsigned DimIdx, unsigned EndIdx,
- unsigned NumGradients) {
+static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
+ MVT PackVectorVT,
+ SmallVectorImpl<SDValue> &PackedAddrs,
+ unsigned DimIdx, unsigned EndIdx,
+ unsigned NumGradients) {
SDLoc DL(Op);
for (unsigned I = DimIdx; I < EndIdx; I++) {
SDValue Addr = Op.getOperand(I);
@@ -5994,56 +6139,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
MVT VAddrScalarVT = VAddrVT.getScalarType();
- MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
VAddrScalarVT = VAddrVT.getScalarType();
+ MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
- if (IsA16 || IsG16) {
- if (IsA16) {
- if (!ST->hasA16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit addresses\n");
- return Op;
- }
- if (!IsG16) {
- LLVM_DEBUG(
- dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
- "need 16 bit derivatives but got 32 bit derivatives\n");
- return Op;
- }
- } else if (!ST->hasG16()) {
+
+ if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
+ // 16 bit gradients are supported, but are tied to the A16 control
+ // so both gradients and addresses must be 16 bit
+ LLVM_DEBUG(
+ dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+ "require 16 bit args for both gradients and addresses");
+ return Op;
+ }
+
+ if (IsA16) {
+ if (!ST->hasA16()) {
LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
+ "support 16 bit addresses\n");
return Op;
}
+ }
- if (BaseOpcode->Gradients && !IsA16) {
- if (!ST->hasG16()) {
- LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
- "support 16 bit derivatives\n");
- return Op;
- }
- // Activate g16
- const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
- AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
- IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
- }
+ // We've dealt with incorrect input so we know that if IsA16, IsG16
+ // are set then we have to compress/pack operands (either address,
+ // gradient or both)
+ // In the case where a16 and gradients are tied (no G16 support) then we
+ // have already verified that both IsA16 and IsG16 are true
+ if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
+ // Activate g16
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+ }
- // Don't compress addresses for G16
- const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
- packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs,
- ArgOffset + Intr->GradientStart, PackEndIdx,
- Intr->NumGradients);
+ // Add gradients (packed or unpacked)
+ if (IsG16) {
+ // Pack the gradients
+ // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
+ packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
+ ArgOffset + Intr->GradientStart,
+ ArgOffset + Intr->CoordStart, Intr->NumGradients);
+ } else {
+ for (unsigned I = ArgOffset + Intr->GradientStart;
+ I < ArgOffset + Intr->CoordStart; I++)
+ VAddrs.push_back(Op.getOperand(I));
+ }
- if (!IsA16) {
- // Add uncompressed address
- for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
- VAddrs.push_back(Op.getOperand(I));
- }
+ // Add addresses (packed or unpacked)
+ if (IsA16) {
+ packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
+ ArgOffset + Intr->CoordStart, VAddrEnd,
+ 0 /* No gradients */);
} else {
- for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++)
+ // Add uncompressed address
+ for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
VAddrs.push_back(Op.getOperand(I));
}
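
The reworked image lowering packs 16-bit gradient and address operands into 32-bit VADDR dwords (two f16/i16 values per dword) whenever G16 or A16 is in effect. A simplified sketch of that packing step on raw 16-bit values; the real code also respects gradient/coordinate boundaries, which is omitted here:

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack pairs of 16-bit operands into dwords, low element in the low half,
// mimicking the v2f16/v2i16 build_vector packing; an odd trailing element
// simply occupies the low half of its own dword.
std::vector<uint32_t> packToDwords(const std::vector<uint16_t> &Ops) {
  std::vector<uint32_t> Dwords;
  for (size_t I = 0; I < Ops.size(); I += 2) {
    uint32_t Lo = Ops[I];
    uint32_t Hi = (I + 1 < Ops.size()) ? Ops[I + 1] : 0;
    Dwords.push_back(Lo | (Hi << 16));
  }
  return Dwords;
}

int main() {
  auto D = packToDwords({0x1111, 0x2222, 0x3333});
  std::printf("%08x %08x\n", static_cast<unsigned>(D[0]),
              static_cast<unsigned>(D[1])); // 22221111 00003333
}
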
@@ -6058,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
//
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
- bool UseNSA =
- ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
+ bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
+ VAddrs.size() >= 3 &&
+ VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
SDValue VAddr;
if (!UseNSA)
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
@@ -6120,19 +6274,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
- SDValue GLC;
- SDValue SLC;
- SDValue DLC;
- if (BaseOpcode->Atomic) {
- GLC = True; // TODO no-return optimization
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- } else {
- if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex),
- DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr))
- return Op;
- }
+ unsigned CPol = cast<ConstantSDNode>(
+ Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
+ if (BaseOpcode->Atomic)
+ CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
+ if (CPol & ~AMDGPU::CPol::ALL)
+ return Op;
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
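
Instead of decoding GLC/SLC/DLC into separate SDValue operands, the cache-policy immediate is now carried through as a single CPol bitmask, with GLC forced on for atomics and any unknown bits rejected. A sketch of that validation, assuming GLC=1, SLC=2, DLC=4 (check SIDefines.h for the authoritative bit assignments):

#include <cstdio>

// Assumed bit layout for the cache-policy immediate (hedged, see lead-in).
enum CPolBits : unsigned { GLC = 1u << 0, SLC = 1u << 1, DLC = 1u << 2 };
constexpr unsigned AllKnown = GLC | SLC | DLC;

// Mirrors the check above: force GLC for atomics, reject unknown bits.
bool validateCPol(unsigned &CPol, bool IsAtomic) {
  if (IsAtomic)
    CPol |= GLC; // placeholder for the no-return optimization noted upstream
  return (CPol & ~AllKnown) == 0;
}

int main() {
  unsigned C = SLC;
  std::printf("%d %u\n", validateCPol(C, /*IsAtomic=*/true), C); // 1 3
  unsigned Bad = 0x80;
  std::printf("%d\n", validateCPol(Bad, false));                 // 0
}
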
@@ -6148,16 +6295,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
Ops.push_back(Unorm);
- if (IsGFX10Plus)
- Ops.push_back(DLC);
- Ops.push_back(GLC);
- Ops.push_back(SLC);
+ Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
if (IsGFX10Plus)
Ops.push_back(IsA16 ? True : False);
- Ops.push_back(TFE);
- Ops.push_back(LWE);
+ if (!Subtarget->hasGFX90AInsts()) {
+ Ops.push_back(TFE); //tfe
+ } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
+ report_fatal_error("TFE is not supported on this GPU");
+ }
+ Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -6175,7 +6323,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
: AMDGPU::MIMGEncGfx10Default,
NumVDataDwords, NumVAddrDwords);
} else {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->hasGFX90AInsts()) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ report_fatal_error(
+ "requested image instruction is not supported on this GPU");
+ }
+ if (Opcode == -1 &&
+ Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
@@ -6194,15 +6350,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 1> Elt;
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
- } else if (!BaseOpcode->Store) {
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL,
- *DAG.getContext());
}
-
- return SDValue(NewNode, 0);
+ if (BaseOpcode->Store)
+ return SDValue(NewNode, 0);
+ return constructRetValue(DAG, NewNode,
+ OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16,
+ DMaskLanes, NumVDataDwords, DL);
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
@@ -6448,11 +6602,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
- bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
- SDValue GLC;
- SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
- if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
- IsGFX10Plus ? &DLC : nullptr))
+ unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+ if (CPol & ~AMDGPU::CPol::ALL)
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -6607,6 +6758,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_alignbit:
return DAG.getNode(ISD::FSHR, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_perm:
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_reloc_constant: {
Module *M = const_cast<Module *>(MF.getFunction().getParent());
const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
@@ -6626,28 +6780,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
-// This function computes an appropriate offset to pass to
-// MachineMemOperand::setOffset() based on the offset inputs to
-// an intrinsic. If any of the offsets are non-contstant or
-// if VIndex is non-zero then this function returns 0. Otherwise,
-// it returns the sum of VOffset, SOffset, and Offset.
-static unsigned getBufferOffsetForMMO(SDValue VOffset,
- SDValue SOffset,
- SDValue Offset,
- SDValue VIndex = SDValue()) {
-
+/// Update \p MMO based on the offset inputs to an intrinsic.
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
+ SDValue SOffset, SDValue Offset,
+ SDValue VIndex = SDValue()) {
if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
- !isa<ConstantSDNode>(Offset))
- return 0;
+ !isa<ConstantSDNode>(Offset)) {
+ // The combined offset is not known to be constant, so we cannot represent
+ // it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
+ }
- if (VIndex) {
- if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
- return 0;
+ if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
+ !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+ // The strided index component of the address is not known to be zero, so we
+ // cannot represent it in the MMO. Give up.
+ MMO->setValue((Value *)nullptr);
+ return;
}
- return cast<ConstantSDNode>(VOffset)->getSExtValue() +
- cast<ConstantSDNode>(SOffset)->getSExtValue() +
- cast<ConstantSDNode>(Offset)->getSExtValue();
+ MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
+ cast<ConstantSDNode>(SOffset)->getSExtValue() +
+ cast<ConstantSDNode>(Offset)->getSExtValue());
}
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
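
updateBufferMMO either records the exact combined offset (voffset + soffset + immediate offset, when all three are constants and vindex is known to be zero) or clears the memory operand's value entirely, instead of the old behaviour of silently reporting offset 0. A plain sketch of that decision:

#include <cstdio>
#include <optional>

// Sketch of the updateBufferMMO logic: yield the combined constant offset, or
// nothing when any component is non-constant or the strided index may be
// non-zero (the real code then clears the MachineMemOperand's value).
std::optional<long> combinedBufferOffset(std::optional<long> VOffset,
                                         std::optional<long> SOffset,
                                         std::optional<long> ImmOffset,
                                         std::optional<long> VIndex) {
  if (!VOffset || !SOffset || !ImmOffset)
    return std::nullopt;              // not representable in the MMO
  if (VIndex && *VIndex != 0)
    return std::nullopt;              // unknown stride contribution
  return *VOffset + *SOffset + *ImmOffset;
}

int main() {
  auto Off = combinedBufferOffset(16, 4, 8, 0);
  std::printf("%ld\n", Off ? *Off : -1); // 28
}
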
@@ -6670,13 +6825,21 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
M->getMemOperand());
}
+// Return a value to use for the idxen operand by examining the vindex operand.
+static unsigned getIdxEn(SDValue VIndex) {
+ if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
+ // No need to set idxen if vindex is known to be zero.
+ return VIndexC->getZExtValue() != 0;
+ return 1;
+}
+
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -6697,8 +6860,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
@@ -6811,9 +6973,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_load_format: {
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6824,11 +6984,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
-
- unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
@@ -6836,7 +6992,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -6868,7 +7024,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
@@ -6888,8 +7044,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
- Ops[2]));
+ updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
@@ -6900,9 +7055,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -6983,9 +7136,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_buffer_atomic_fadd: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // vdata
@@ -6997,14 +7148,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
unsigned Opcode = 0;
switch (IntrID) {
@@ -7042,7 +7191,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
case Intrinsic::amdgcn_buffer_atomic_fadd:
- if (!Op.getValue(0).use_empty()) {
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
DiagnosticInfoUnsupported
NoFpRet(DAG.getMachineFunction().getFunction(),
"return versions of fp atomics not supported",
@@ -7063,6 +7212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
@@ -7119,9 +7276,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(5));
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -7134,13 +7289,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7161,7 +7314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7182,33 +7335,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
- Ops[4]));
+ updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- if (!Op.getValue(0).use_empty()) {
- DiagnosticInfoUnsupported
- NoFpRet(DAG.getMachineFunction().getFunction(),
- "return versions of fp atomics not supported",
- DL.getDebugLoc(), DS_Error);
- DAG.getContext()->diagnose(NoFpRet);
- return SDValue();
- }
- MemSDNode *M = cast<MemSDNode>(Op);
- SDValue Ops[] = {
- M->getOperand(0), // Chain
- M->getOperand(2), // Ptr
- M->getOperand(3) // Value
- };
-
- EVT VT = Op.getOperand(3).getValueType();
- return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
- DAG.getVTList(VT, MVT::Other), Ops,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
@@ -7224,6 +7355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
assert(RayDir.getValueType() == MVT::v4f16 ||
RayDir.getValueType() == MVT::v4f32);
+ if (!Subtarget->hasGFX10_AEncoding()) {
+ emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
+ return SDValue();
+ }
+
bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
bool Is64 = NodePtr.getValueType() == MVT::i64;
unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
@@ -7279,7 +7415,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
return SDValue(NewNode, 0);
}
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
+ DiagnosticInfoUnsupported
+ NoFpRet(DAG.getMachineFunction().getFunction(),
+ "return versions of fp atomics not supported",
+ DL.getDebugLoc(), DS_Error);
+ DAG.getContext()->diagnose(NoFpRet);
+ return SDValue();
+ }
+ LLVM_FALLTHROUGH;
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+ unsigned Opcode = 0;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_global_atomic_fadd:
+ case Intrinsic::amdgcn_flat_atomic_fadd: {
+ EVT VT = Op.getOperand(3).getValueType();
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand());
+ }
+ case Intrinsic::amdgcn_global_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ }
+ case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax: {
+ Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ }
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+ return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default:
+
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
return lowerImage(Op, ImageDimIntr, DAG, true);
@@ -7448,9 +7632,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData, // vdata
@@ -7461,7 +7643,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(7), // offset
DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7486,7 +7668,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7511,7 +7693,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -7528,9 +7710,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
- unsigned IdxEn = 1;
- if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
- IdxEn = Idx->getZExtValue() != 0;
+ unsigned IdxEn = getIdxEn(Op.getOperand(4));
SDValue Ops[] = {
Chain,
VData,
@@ -7542,15 +7722,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
- // We don't know the offset if vindex is non-zero, so clear it.
- if (IdxEn)
- Offset = 0;
+ setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(Offset);
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7597,7 +7775,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
@@ -7644,8 +7822,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
- Ops[3]));
+ updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- Align Alignment) const {
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+ SelectionDAG &DAG, SDValue *Offsets,
+ Align Alignment) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
@@ -7737,7 +7914,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return SOffset + ImmOffset;
+ return;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7750,13 +7927,12 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
- return 0;
+ return;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
- return 0;
}
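
For reference, a standalone sketch of the constant-offset case handled above (illustrative only: the sample value, the 12-bit instoffset width and the simple masking are my assumptions; the in-tree code delegates the actual split to a subtarget-aware SIInstrInfo helper that also honors the requested alignment):

  // Illustration, not part of the patch: splitting a constant combined
  // offset into an soffset part and an instoffset immediate.
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Combined = 0x11234;             // hypothetical combined offset
    uint32_t ImmOffset = Combined & 0xfffu;  // low 12 bits -> instoffset
    uint32_t SOffset = Combined - ImmOffset; // remainder -> soffset
    // voffset stays 0 when the whole offset is a known constant.
    std::printf("voffset=0 soffset=0x%x instoffset=0x%x\n", SOffset, ImmOffset);
    return 0;
  }
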
// Handle 8 bit and 16 bit buffer loads
@@ -8263,8 +8439,8 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
-static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
- const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
? FP_DENORM_FLUSH_NONE
@@ -8794,18 +8970,20 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
}
// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cmdmask_b32 to be deserialized.
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
static bool isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
- default: break;
+ default:
+ break;
case ISD::SETCC:
+ case AMDGPUISD::FP_CLASS:
+ return true;
case ISD::AND:
case ISD::OR:
case ISD::XOR:
- case AMDGPUISD::FP_CLASS:
- return true;
+ return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
}
return false;
}
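
The AND/OR/XOR cases are tightened here: an i1 logical operation only counts as a ready-made lane mask when both of its operands already do. A comment-style illustration (my own, not from the patch):

  // (and (setcc ...), (setcc ...))   -> both inputs are lane masks, so the
  //                                     result can feed control flow directly.
  // (and (setcc ...), (i1 load ...)) -> rejected now; the loaded i1 is not a
  //                                     lane mask and would still need
  //                                     v_cndmask_b32 to materialize.
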
@@ -9206,63 +9384,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
- switch (Opc) {
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
- case ISD::FDIV:
- case ISD::FREM:
- case ISD::FMA:
- case ISD::FMAD:
- case ISD::FCANONICALIZE:
- case ISD::FP_ROUND:
- case ISD::UINT_TO_FP:
- case ISD::SINT_TO_FP:
- case ISD::FABS:
- // Fabs is lowered to a bit operation, but it's an and which will clear the
- // high bits anyway.
- case ISD::FSQRT:
- case ISD::FSIN:
- case ISD::FCOS:
- case ISD::FPOWI:
- case ISD::FPOW:
- case ISD::FLOG:
- case ISD::FLOG2:
- case ISD::FLOG10:
- case ISD::FEXP:
- case ISD::FEXP2:
- case ISD::FCEIL:
- case ISD::FTRUNC:
- case ISD::FRINT:
- case ISD::FNEARBYINT:
- case ISD::FROUND:
- case ISD::FFLOOR:
- case ISD::FMINNUM:
- case ISD::FMAXNUM:
- case AMDGPUISD::FRACT:
- case AMDGPUISD::CLAMP:
- case AMDGPUISD::COS_HW:
- case AMDGPUISD::SIN_HW:
- case AMDGPUISD::FMIN3:
- case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMED3:
- case AMDGPUISD::FMAD_FTZ:
- case AMDGPUISD::RCP:
- case AMDGPUISD::RSQ:
- case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
- return true;
- default:
- // fcopysign, select and others may be lowered to 32-bit bit operations
- // which don't zero the high bits.
- return false;
- }
-}
-
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
@@ -9277,15 +9398,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16)
return SDValue();
- // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
- // FIXME: It is not universally true that the high bits are zeroed on gfx9.
- if (Src.getOpcode() == ISD::BITCAST) {
- SDValue BCSrc = Src.getOperand(0);
- if (BCSrc.getValueType() == MVT::f16 &&
- fp16SrcZerosHighBits(BCSrc.getOpcode()))
- return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
- }
-
return SDValue();
}
@@ -9482,19 +9594,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// Could be anything.
return false;
- case ISD::BITCAST: {
+ case ISD::BITCAST:
+ return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+ case ISD::TRUNCATE: {
// Hack around the mess we make when legalizing extract_vector_elt
- SDValue Src = Op.getOperand(0);
- if (Src.getValueType() == MVT::i16 &&
- Src.getOpcode() == ISD::TRUNCATE) {
- SDValue TruncSrc = Src.getOperand(0);
+ if (Op.getValueType() == MVT::i16) {
+ SDValue TruncSrc = Op.getOperand(0);
if (TruncSrc.getValueType() == MVT::i32 &&
TruncSrc.getOpcode() == ISD::BITCAST &&
TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
}
}
-
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -9527,6 +9638,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
llvm_unreachable("invalid operation");
}
+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+ unsigned MaxDepth) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getVRegDef(Reg);
+ unsigned Opcode = MI->getOpcode();
+
+ if (Opcode == AMDGPU::G_FCANONICALIZE)
+ return true;
+
+ if (Opcode == AMDGPU::G_FCONSTANT) {
+ auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+ if (F.isNaN() && F.isSignaling())
+ return false;
+ return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+ }
+
+ if (MaxDepth == 0)
+ return false;
+
+ switch (Opcode) {
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE: {
+ if (Subtarget->supportsMinMaxDenormModes() ||
+ denormalsEnabledForType(MRI.getType(Reg), MF))
+ return true;
+ for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+ if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+ return false;
+ }
+ return true;
+ }
+ default:
+ return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+ isKnownNeverSNaN(Reg, MRI);
+ }
+
+ llvm_unreachable("invalid operation");
+}
+
// Constant fold canonicalize.
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
@@ -9694,15 +9844,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
}
// If there isn't a 16-bit med3 operation, convert to 32-bit.
- MVT NVT = MVT::i32;
- unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ if (VT == MVT::i16) {
+ MVT NVT = MVT::i32;
+ unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+ SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+ SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+ SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
- SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+ SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ }
- SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+ return SDValue();
}
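
Only genuine i16 candidates are widened after this change; everything else now falls through to the final return SDValue(). A standalone illustration of the widen-then-truncate idea itself (the sample values and the med3 formula are mine, not taken from the patch):

  // Illustration, not part of the patch: emulate a 16-bit signed med3 with
  // a 32-bit median computation, then truncate the result back to 16 bits.
  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  static int32_t med3(int32_t A, int32_t B, int32_t C) {
    return std::max(std::min(A, B), std::min(std::max(A, B), C));
  }

  int main() {
    int16_t A = -5, B = 100, C = 7;
    int16_t R = static_cast<int16_t>(med3(A, B, C)); // sign-extend, med3, trunc
    std::printf("med3(-5, 100, 7) = %d\n", R);       // prints 7
    return 0;
  }
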
static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
@@ -10408,7 +10562,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
EVT VT = N->getValueType(0);
SDLoc SL(N);
- if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+ if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
return SDValue();
// FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -10791,7 +10945,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned NewDmask = 0;
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
- bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+ bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11067,6 +11221,95 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
+// Any MIMG instructions that use tfe or lwe require an initialization of the
+// result register that will be written in the case of a memory access failure.
+// The required code is also added to tie this init code to the result of the
+// img instruction.
+void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+ MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+ MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+ if (!TFE && !LWE) // intersect_ray
+ return;
+
+ unsigned TFEVal = TFE ? TFE->getImm() : 0;
+ unsigned LWEVal = LWE->getImm();
+ unsigned D16Val = D16 ? D16->getImm() : 0;
+
+ if (!TFEVal && !LWEVal)
+ return;
+
+ // At least one of TFE or LWE is non-zero.
+ // We have to insert a suitable initialization of the result value and
+ // tie this to the dest of the image instruction.
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ int DstIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+ // Calculate which dword we have to initialize to 0.
+ MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+ // Check that the dmask operand is found.
+ assert(MO_Dmask && "Expected dmask operand in instruction");
+
+ unsigned dmask = MO_Dmask->getImm();
+ // Determine the number of active lanes taking into account the
+ // Gather4 special case
+ unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
+
+ bool Packed = !Subtarget->hasUnpackedD16VMem();
+
+ unsigned InitIdx =
+ D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+ // Abandon the attempt if the dst size isn't large enough
+ // - this is in fact an error, but it is picked up elsewhere and
+ // reported correctly.
+ uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+ if (DstSize < InitIdx)
+ return;
+
+ // Create a register for the initialization value.
+ Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ unsigned NewDst = 0; // Final initialized value will be in here
+
+ // If the PRTStrictNull feature is enabled (the default), then initialize
+ // all the result registers to 0; otherwise just the error indication
+ // register (VGPRn+1).
+ unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
+ unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
+
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+ for (; SizeLeft; SizeLeft--, CurrIdx++) {
+ NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+ // Initialize dword
+ Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+ .addImm(0);
+ // Insert into the super-reg
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+ .addReg(PrevDst)
+ .addReg(SubReg)
+ .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
+
+ PrevDst = NewDst;
+ }
+
+ // Add as an implicit operand
+ MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
+
+ // Tie the just added implicit operand to the dst
+ MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+}
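
A worked example of the InitIdx computation above (standalone and illustrative; the dmask value and the D16/packed settings are assumptions, not taken from the patch):

  // Illustration: how many result dwords get zero-initialized, including
  // the extra TFE/LWE status dword.
  #include <bit>
  #include <cstdio>

  int main() {
    unsigned Dmask = 0b1011;                     // three enabled channels
    unsigned ActiveLanes = std::popcount(Dmask); // 3 (gather4 would force 4)
    bool D16 = true, Packed = true;              // packed 16-bit result
    unsigned InitIdx =
        (D16 && Packed) ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
    std::printf("dwords to initialize: %u\n", InitIdx); // prints 3
    return 0;
  }
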
+
/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -11114,10 +11357,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::glc1);
- if (Glc1Idx != -1)
- MI.RemoveOperand(Glc1Idx);
+ int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::cpol);
+ if (CPolIdx != -1) {
+ MachineOperand &CPol = MI.getOperand(CPolIdx);
+ CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+ }
MI.RemoveOperand(0);
MI.setDesc(TII->get(NoRetAtomicOp));
return;
@@ -11148,6 +11393,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
return;
}
+
+ if (TII->isMIMG(MI) && !MI.mayStore())
+ AddIMGInit(MI);
}
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
@@ -11226,9 +11474,11 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
//===----------------------------------------------------------------------===//
std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
StringRef Constraint,
MVT VT) const {
+ const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
const unsigned BitWidth = VT.getSizeInBits();
@@ -11257,7 +11507,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::VGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ RC = TRI->getVGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11271,7 +11521,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::AGPR_32RegClass;
break;
default:
- RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ RC = TRI->getAGPRClassForBitWidth(BitWidth);
if (!RC)
return std::make_pair(0U, nullptr);
break;
@@ -11444,6 +11694,47 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
return false;
}
+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+ switch (UnalignedClassID) {
+ case AMDGPU::VReg_64RegClassID:
+ return AMDGPU::VReg_64_Align2RegClassID;
+ case AMDGPU::VReg_96RegClassID:
+ return AMDGPU::VReg_96_Align2RegClassID;
+ case AMDGPU::VReg_128RegClassID:
+ return AMDGPU::VReg_128_Align2RegClassID;
+ case AMDGPU::VReg_160RegClassID:
+ return AMDGPU::VReg_160_Align2RegClassID;
+ case AMDGPU::VReg_192RegClassID:
+ return AMDGPU::VReg_192_Align2RegClassID;
+ case AMDGPU::VReg_224RegClassID:
+ return AMDGPU::VReg_224_Align2RegClassID;
+ case AMDGPU::VReg_256RegClassID:
+ return AMDGPU::VReg_256_Align2RegClassID;
+ case AMDGPU::VReg_512RegClassID:
+ return AMDGPU::VReg_512_Align2RegClassID;
+ case AMDGPU::VReg_1024RegClassID:
+ return AMDGPU::VReg_1024_Align2RegClassID;
+ case AMDGPU::AReg_64RegClassID:
+ return AMDGPU::AReg_64_Align2RegClassID;
+ case AMDGPU::AReg_96RegClassID:
+ return AMDGPU::AReg_96_Align2RegClassID;
+ case AMDGPU::AReg_128RegClassID:
+ return AMDGPU::AReg_128_Align2RegClassID;
+ case AMDGPU::AReg_160RegClassID:
+ return AMDGPU::AReg_160_Align2RegClassID;
+ case AMDGPU::AReg_192RegClassID:
+ return AMDGPU::AReg_192_Align2RegClassID;
+ case AMDGPU::AReg_256RegClassID:
+ return AMDGPU::AReg_256_Align2RegClassID;
+ case AMDGPU::AReg_512RegClassID:
+ return AMDGPU::AReg_512_Align2RegClassID;
+ case AMDGPU::AReg_1024RegClassID:
+ return AMDGPU::AReg_1024_Align2RegClassID;
+ default:
+ return -1;
+ }
+}
+
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -11452,6 +11743,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -11474,7 +11766,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
for (auto &MBB : MF) {
for (auto &MI : MBB) {
TII->fixImplicitOperands(MI);
@@ -11482,13 +11773,30 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
}
+ // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
+ // classes if required. Ideally the register class constraints would differ
+ // per-subtarget, but there's no easy way to achieve that right now. This is
+ // not a problem for VGPRs because the correctly aligned VGPR class is implied
+ // from using them as the register class for legal types.
+ if (ST.needsAlignedVGPRs()) {
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ const Register Reg = Register::index2VirtReg(I);
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (!RC)
+ continue;
+ int NewClassID = getAlignedAGPRClassID(RC->getID());
+ if (NewClassID != -1)
+ MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+ }
+ }
+
TargetLoweringBase::finalizeLowering(MF);
// Allocate a VGPR for future SGPR Spill if
// "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
// FIXME: We won't need this hack if we split SGPR allocation from VGPR
- if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
- !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+ if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
+ !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
Info->reserveVGPRforSGPRSpills(MF);
}
@@ -11690,8 +11998,37 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ case AMDGPUISD::ATOMIC_CMP_SWAP:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_SWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_ADD:
+ case AMDGPUISD::BUFFER_ATOMIC_SUB:
+ case AMDGPUISD::BUFFER_ATOMIC_SMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_UMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_SMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_UMAX:
+ case AMDGPUISD::BUFFER_ATOMIC_AND:
+ case AMDGPUISD::BUFFER_ATOMIC_OR:
+ case AMDGPUISD::BUFFER_ATOMIC_XOR:
+ case AMDGPUISD::BUFFER_ATOMIC_INC:
+ case AMDGPUISD::BUFFER_ATOMIC_DEC:
+ case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
+ case AMDGPUISD::BUFFER_ATOMIC_CSUB:
+ case AMDGPUISD::BUFFER_ATOMIC_FADD:
+ case AMDGPUISD::BUFFER_ATOMIC_FMIN:
+ case AMDGPUISD::BUFFER_ATOMIC_FMAX:
+ // Target-specific read-modify-write atomics are sources of divergence.
+ return true;
+ default:
+ if (auto *A = dyn_cast<AtomicSDNode>(N)) {
+ // Generic read-modify-write atomics are sources of divergence.
+ return A->readMem() && A->writeMem();
+ }
+ return false;
}
- return false;
}
bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
@@ -11707,6 +12044,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
}
}
+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+ MachineFunction &MF) const {
+ switch (Ty.getScalarSizeInBits()) {
+ case 32:
+ return hasFP32Denormals(MF);
+ case 64:
+ case 16:
+ return hasFP64FP16Denormals(MF);
+ default:
+ return false;
+ }
+}
+
bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -11745,24 +12095,57 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (Ty->isHalfTy())
return AtomicExpansionKind::None;
- if (!Ty->isFloatTy())
+ if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
- // TODO: Do have these for flat. Older targets also had them for buffers.
unsigned AS = RMW->getPointerAddressSpace();
- if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
- if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+ Subtarget->hasAtomicFaddInsts()) {
+ // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+ // floating point atomic instructions. May generate more efficient code,
+ // but may not respect rounding and denormal modes, and may give incorrect
+ // results for certain memory destinations.
+ if (RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() != "true")
+ return AtomicExpansionKind::CmpXChg;
+
+ if (Subtarget->hasGFX90AInsts()) {
+ if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
+ return AtomicExpansionKind::CmpXChg;
+
+ auto SSID = RMW->getSyncScopeID();
+ if (SSID == SyncScope::System ||
+ SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+ return AtomicExpansionKind::CmpXChg;
+
+ return AtomicExpansionKind::None;
+ }
+
+ if (AS == AMDGPUAS::FLAT_ADDRESS)
return AtomicExpansionKind::CmpXChg;
- return RMW->use_empty() ? AtomicExpansionKind::None :
- AtomicExpansionKind::CmpXChg;
+ return RMW->use_empty() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
}
// DS FP atomics do respect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
- return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
- AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+ // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+ if (!Ty->isDoubleTy())
+ return AtomicExpansionKind::None;
+
+ return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+ RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() == "true")
+ ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
+
+ return AtomicExpansionKind::CmpXChg;
}
default:
break;
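
The expansion policy above is keyed off the "amdgpu-unsafe-fp-atomics" string attribute on the enclosing function. A sketch of how a frontend or pass might set it (my own illustration using the generic Function attribute API, not part of this patch):

  // Illustration: opt a function in to the unsafe FP atomic lowering.
  // Without the attribute, an atomicrmw fadd on a global/flat pointer is
  // expanded to a compare-and-swap loop by the code above.
  #include "llvm/IR/Function.h"

  static void enableUnsafeFPAtomics(llvm::Function &F) {
    F.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
  }
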
@@ -11872,10 +12255,11 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}
-std::pair<int, MVT>
+std::pair<InstructionCost, MVT>
SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Type *Ty) const {
- auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> Cost =
+ TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
auto Size = DL.getTypeSizeInBits(Ty);
// Maximum load or store can handle 8 dwords for scalar and 4 for
// vector ALU. Let's assume anything above 8 dwords is expensive