author     Dimitry Andric <dim@FreeBSD.org>  2021-07-29 20:15:26 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2021-07-29 20:15:26 +0000
commit     344a3780b2e33f6ca763666c380202b18aab72a3 (patch)
tree       f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AMDGPU/SIISelLowering.cpp
parent     b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff)
Vendor import of llvm-project main 88e66fa60ae5, the last commit before
the upstream release/13.x branch was created.
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  1218
1 file changed, 801 insertions, 417 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 839437b5e3f8..d98acfc6c532 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -19,11 +19,13 @@ #include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" #include "llvm/Support/CommandLine.h" @@ -80,36 +82,49 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); - addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + + const SIRegisterInfo *TRI = STI.getRegisterInfo(); + const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); + + addRegisterClass(MVT::f64, V64RegClass); + addRegisterClass(MVT::v2f32, V64RegClass); addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass); - addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass); + addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); - addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); + addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); + + addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass); + addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); + + addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass); + addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); - addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); - addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); - addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); @@ -123,7 +138,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - 
addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -139,6 +154,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::LOAD, MVT::v3i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v5i32, Custom); + setOperationAction(ISD::LOAD, MVT::v6i32, Custom); + setOperationAction(ISD::LOAD, MVT::v7i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); setOperationAction(ISD::LOAD, MVT::i1, Custom); @@ -148,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v3i32, Custom); setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v5i32, Custom); + setOperationAction(ISD::STORE, MVT::v6i32, Custom); + setOperationAction(ISD::STORE, MVT::v7i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); setOperationAction(ISD::STORE, MVT::i1, Custom); @@ -170,6 +189,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand); + setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand); setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); @@ -197,8 +218,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v3i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v3f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v5i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v5f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v6i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v6f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v7i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v7f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); @@ -239,6 +268,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // with > 4 elements. 
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, + MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { @@ -249,10 +279,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::BITCAST: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: case ISD::SCALAR_TO_VECTOR: break; + case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); break; @@ -284,6 +314,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32); + } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); @@ -336,17 +380,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Avoid stack access for these. // TODO: Generalize to more vector types. + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom); @@ -362,9 +403,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom); - // Deal with vec5 vector operations when widened to vec8. + // Deal with vec5/6/7 vector operations when widened to vec8. 
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v6f32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7i32, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v7f32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom); @@ -384,6 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); // FIXME: This should be narrowed to i32, but that only happens if i64 is // illegal. @@ -525,8 +571,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote); AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); - setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote); - setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i16, Custom); // F16 - Constant Actions. setOperationAction(ISD::ConstantFP, MVT::f16, Legal); @@ -718,6 +764,19 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FEXP, MVT::v2f16, Custom); setOperationAction(ISD::SELECT, MVT::v4i16, Custom); setOperationAction(ISD::SELECT, MVT::v4f16, Custom); + + if (Subtarget->hasPackedFP32Ops()) { + setOperationAction(ISD::FADD, MVT::v2f32, Legal); + setOperationAction(ISD::FMUL, MVT::v2f32, Legal); + setOperationAction(ISD::FMA, MVT::v2f32, Legal); + setOperationAction(ISD::FNEG, MVT::v2f32, Legal); + + for (MVT VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32 }) { + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FMA, VT, Custom); + } + } } setOperationAction(ISD::FNEG, MVT::v4f16, Custom); @@ -1128,17 +1187,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } - case Intrinsic::amdgcn_global_atomic_fadd: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags = MachineMemOperand::MOLoad | - MachineMemOperand::MOStore | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -1150,6 +1198,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MODereferenceable; return true; } + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1191,6 +1255,9 @@ bool 
SITargetLowering::getAddrModeArguments(IntrinsicInst *II, case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -1210,9 +1277,9 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { } return AM.Scale == 0 && - (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, - /*Signed=*/false)); + (AM.BaseOffs == 0 || + Subtarget->getInstrInfo()->isLegalFLATOffset( + AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); } bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { @@ -1220,7 +1287,7 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { return AM.Scale == 0 && (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, - /*Signed=*/true)); + SIInstrFlags::FlatGlobal)); if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { // Assume the we will use FLAT for all global memory accesses @@ -1385,10 +1452,15 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return true; } + // Either, the alignment requirements are "enabled", or there is an + // unaligned LDS access related hardware bug though alignment requirements + // are "disabled". In either case, we need to check for proper alignment + // requirements. + // if (Size == 64) { - // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte - // aligned, 8 byte access in a single operation using ds_read2/write2_b32 - // with adjacent offsets. + // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we + // can do a 4 byte aligned, 8 byte access in a single operation using + // ds_read2/write2_b32 with adjacent offsets. bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; @@ -1396,22 +1468,23 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( return AlignedBy4; } if (Size == 96) { - // ds_read/write_b96 require 16-byte alignment on gfx8 and older. - bool Aligned = Alignment >= Align(16); + // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on + // gfx8 and older. + bool AlignedBy16 = Alignment >= Align(16); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy16; - return Aligned; + return AlignedBy16; } if (Size == 128) { - // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we - // can do a 8 byte aligned, 16 byte access in a single operation using - // ds_read2/write2_b64. - bool Aligned = Alignment >= Align(8); + // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on + // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a + // single operation using ds_read2/write2_b64. 
+ bool AlignedBy8 = Alignment >= Align(8); if (IsFast) - *IsFast = Aligned; + *IsFast = AlignedBy8; - return Aligned; + return AlignedBy8; } } @@ -1467,8 +1540,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( } bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Alignment, - MachineMemOperand::Flags Flags, bool *IsFast) const { + EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, + bool *IsFast) const { if (IsFast) *IsFast = false; @@ -1482,7 +1555,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, - Align(Alignment), Flags, IsFast); + Alignment, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1535,8 +1608,8 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - int NumElts = VT.getVectorNumElements(); - if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && + VT.getScalarType().bitsLE(MVT::i16)) return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1799,23 +1872,37 @@ void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); - Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); + unsigned Mask = (Subtarget->hasPackedTID() && + Info.hasWorkItemIDY()) ? 0x3ff : ~0u; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); } if (Info.hasWorkItemIDY()) { - Register Reg = AMDGPU::VGPR1; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 10)); + } else { + unsigned Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); + } } if (Info.hasWorkItemIDZ()) { - Register Reg = AMDGPU::VGPR2; - MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); + assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY()); + if (Subtarget->hasPackedTID()) { + Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0, + 0x3ff << 20)); + } else { + unsigned Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); - CCInfo.AllocateReg(Reg); - Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); + } } } @@ -1865,12 +1952,32 @@ static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, return ArgDescriptor::createRegister(Reg); } -static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +// If this has a fixed position, we still should allocate the register in the +// CCInfo state. Technically we could get away with this for values passed +// outside of the normal argument range. 
+static void allocateFixedSGPRInputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + MCRegister Reg) { + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); +} + +static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); } -static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { - return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) { + if (Arg) { + allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, + Arg.getRegister()); + } else + Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } /// Allocate implicit function VGPR arguments at the end of allocated user @@ -1919,29 +2026,29 @@ void SITargetLowering::allocateSpecialInputSGPRs( // TODO: Unify handling with private memory pointers. if (Info.hasDispatchPtr()) - ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); if (Info.hasQueuePtr()) - ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); if (Info.hasDispatchID()) - ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. if (Info.hasWorkGroupIDX()) - ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); if (Info.hasWorkGroupIDY()) - ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); if (Info.hasWorkGroupIDZ()) - ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); + allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } // Allocate special inputs passed in user SGPRs. @@ -2203,6 +2310,8 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getEntryNode(); } + Info->allocateModuleLDSGlobal(Fn.getParent()); + SmallVector<ISD::InputArg, 16> Splits; SmallVector<CCValAssign, 16> ArgLocs; BitVector Skipped(Ins.size()); @@ -2767,6 +2876,7 @@ static bool canGuaranteeTCO(CallingConv::ID CC) { static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: + case CallingConv::AMDGPU_Gfx: return true; default: return canGuaranteeTCO(CC); @@ -2781,6 +2891,11 @@ bool SITargetLowering::isEligibleForTailCallOptimization( if (!mayTailCallThisCC(CalleeCC)) return false; + // For a divergent call target, we need to do a waterfall loop over the + // possible callees which precludes us from using a simple jump. 
+ if (Callee->isDivergent()) + return false; + MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); @@ -2888,12 +3003,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!CLI.CB) report_fatal_error("unsupported libcall legalization"); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - !CLI.CB->getCalledFunction() && CallConv != CallingConv::AMDGPU_Gfx) { - return lowerUnhandledCall(CLI, InVals, - "unsupported indirect call to function "); - } - if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) { return lowerUnhandledCall(CLI, InVals, "unsupported required tail call to function "); @@ -3054,7 +3163,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // locations, which are supposed to be immutable? Chain = addTokenForArgument(Chain, DAG, MFI, FI); } else { - DstAddr = PtrOff; + // Stores to the argument stack area are relative to the stack pointer. + SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), + MVT::i32); + DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff); DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); Alignment = commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); @@ -4150,11 +4262,35 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } case AMDGPU::DS_GWS_INIT: - case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_BR: + case AMDGPU::DS_GWS_BARRIER: + if (Subtarget->needsAlignedVGPRs()) { + // Add implicit aligned super-reg to force alignment on the data operand. + const DebugLoc &DL = MI.getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::data0); + Register DataReg = Op->getReg(); + bool IsAGPR = TRI->isAGPR(MRI, DataReg); + Register Undef = MRI.createVirtualRegister( + IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), Undef); + Register NewVR = + MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass + : &AMDGPU::VReg_64_Align2RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), NewVR) + .addReg(DataReg, 0, Op->getSubReg()) + .addImm(AMDGPU::sub0) + .addReg(Undef) + .addImm(AMDGPU::sub1); + Op->setReg(NewVR); + Op->setSubReg(AMDGPU::sub0); + MI.addOperand(MachineOperand::CreateReg(NewVR, false, true)); + } + LLVM_FALLTHROUGH; + case AMDGPU::DS_GWS_SEMA_V: case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: - case AMDGPU::DS_GWS_BARRIER: // A s_waitcnt 0 is required to be the instruction immediately following. 
if (getSubtarget()->hasGWSAutoReplay()) { bundleInstWithWaitcnt(MI); @@ -4360,7 +4496,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4381,7 +4518,8 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4456,6 +4594,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerFMINNUM_FMAXNUM(Op, DAG); case ISD::FMA: return splitTernaryVectorOp(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -5092,12 +5233,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) + return lowerTrapEndpgm(Op, DAG); + + if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) { + switch (*HsaAbiVer) { + case ELF::ELFABIVERSION_AMDGPU_HSA_V2: + case ELF::ELFABIVERSION_AMDGPU_HSA_V3: + return lowerTrapHsaQueuePtr(Op, DAG); + case ELF::ELFABIVERSION_AMDGPU_HSA_V4: + return Subtarget->supportsGetDoorbellID() ? 
+ lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); + } + } + + llvm_unreachable("Unknown trap handler"); +} + +SDValue SITargetLowering::lowerTrapEndpgm( + SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); + return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +} - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) - return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain); +SDValue SITargetLowering::lowerTrapHsaQueuePtr( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -5108,22 +5272,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64); SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue()); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); SDValue Ops[] = { ToReg, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16), + DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01, ToReg.getValue(1) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } +SDValue SITargetLowering::lowerTrapHsa( + SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue Chain = Op.getOperand(0); + + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap); + SDValue Ops[] = { + Chain, + DAG.getTargetConstant(TrapID, SL, MVT::i16) + }; + return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); +} + SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || - !Subtarget->isTrapHandlerEnabled()) { + if (!Subtarget->isTrapHandlerEnabled() || + Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { DiagnosticInfoUnsupported NoTrap(MF.getFunction(), "debugtrap handler not supported", Op.getDebugLoc(), @@ -5133,9 +5312,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const { return Chain; } + uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap); SDValue Ops[] = { Chain, - DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16) + DAG.getTargetConstant(TrapID, SL, MVT::i16) }; return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops); } @@ -5666,23 +5846,10 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef<SDValue> Elts) { assert(!Elts.empty()); MVT Type; - unsigned NumElts; - - if (Elts.size() == 1) { - Type = MVT::f32; - NumElts = 1; - } else if (Elts.size() == 2) { - Type = MVT::v2f32; - NumElts = 2; - } else if (Elts.size() == 3) { - Type = MVT::v3f32; - NumElts = 3; - } else if (Elts.size() <= 4) { - Type = MVT::v4f32; - NumElts = 4; - } else if (Elts.size() <= 8) { - Type = MVT::v8f32; - NumElts = 8; + unsigned NumElts = Elts.size(); + + if (NumElts <= 8) { + Type = MVT::getVectorVT(MVT::f32, NumElts); } else { assert(Elts.size() <= 16); Type = MVT::v16f32; @@ -5704,28 +5871,6 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, return DAG.getBuildVector(Type, DL, VecElts); } -static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC, SDValue *DLC) { - auto CachePolicyConst = 
cast<ConstantSDNode>(CachePolicy.getNode()); - - uint64_t Value = CachePolicyConst->getZExtValue(); - SDLoc DL(CachePolicy); - if (GLC) { - *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x1; - } - if (SLC) { - *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x2; - } - if (DLC) { - *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32); - Value &= ~(uint64_t)0x4; - } - - return Value == 0; -} - static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts) { EVT SrcVT = Src.getValueType(); @@ -5752,7 +5897,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, ArrayRef<EVT> ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, - const SDLoc &DL, LLVMContext &Context) { + const SDLoc &DL) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; @@ -5835,11 +5980,11 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } -static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, - MVT PackVectorVT, - SmallVectorImpl<SDValue> &PackedAddrs, - unsigned DimIdx, unsigned EndIdx, - unsigned NumGradients) { +static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { SDLoc DL(Op); for (unsigned I = DimIdx; I < EndIdx; I++) { SDValue Addr = Op.getOperand(I); @@ -5994,56 +6139,64 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); MVT VAddrScalarVT = VAddrVT.getScalarType(); - MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); VAddrScalarVT = VAddrVT.getScalarType(); + MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? 
MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; - if (IsA16 || IsG16) { - if (IsA16) { - if (!ST->hasA16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit addresses\n"); - return Op; - } - if (!IsG16) { - LLVM_DEBUG( - dbgs() << "Failed to lower image intrinsic: 16 bit addresses " - "need 16 bit derivatives but got 32 bit derivatives\n"); - return Op; - } - } else if (!ST->hasG16()) { + + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { + // 16 bit gradients are supported, but are tied to the A16 control + // so both gradients and addresses must be 16 bit + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "require 16 bit args for both gradients and addresses"); + return Op; + } + + if (IsA16) { + if (!ST->hasA16()) { LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); + "support 16 bit addresses\n"); return Op; } + } - if (BaseOpcode->Gradients && !IsA16) { - if (!ST->hasG16()) { - LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " - "support 16 bit derivatives\n"); - return Op; - } - // Activate g16 - const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = - AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); - IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 - } + // We've dealt with incorrect input so we know that if IsA16, IsG16 + // are set then we have to compress/pack operands (either address, + // gradient or both) + // In the case where a16 and gradients are tied (no G16 support) then we + // have already verified that both IsA16 and IsG16 are true + if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } - // Don't compress addresses for G16 - const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); - packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, - ArgOffset + Intr->GradientStart, PackEndIdx, - Intr->NumGradients); + // Add gradients (packed or unpacked) + if (IsG16) { + // Pack the gradients + // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); + packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs, + ArgOffset + Intr->GradientStart, + ArgOffset + Intr->CoordStart, Intr->NumGradients); + } else { + for (unsigned I = ArgOffset + Intr->GradientStart; + I < ArgOffset + Intr->CoordStart; I++) + VAddrs.push_back(Op.getOperand(I)); + } - if (!IsA16) { - // Add uncompressed address - for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) - VAddrs.push_back(Op.getOperand(I)); - } + // Add addresses (packed or unpacked) + if (IsA16) { + packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs, + ArgOffset + Intr->CoordStart, VAddrEnd, + 0 /* No gradients */); } else { - for (unsigned I = ArgOffset + Intr->GradientStart; I < VAddrEnd; I++) + // Add uncompressed address + for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) VAddrs.push_back(Op.getOperand(I)); } @@ -6058,8 +6211,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // // SIShrinkInstructions will convert NSA encodings to non-NSA after register // allocation when possible. 
- bool UseNSA = - ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) && + VAddrs.size() >= 3 && + VAddrs.size() <= (unsigned)ST->getNSAMaxSize(); SDValue VAddr; if (!UseNSA) VAddr = getBuildDwordsVector(DAG, DL, VAddrs); @@ -6120,19 +6274,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - SDValue GLC; - SDValue SLC; - SDValue DLC; - if (BaseOpcode->Atomic) { - GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, nullptr, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } else { - if (!parseCachePolicy(Op.getOperand(ArgOffset + Intr->CachePolicyIndex), - DAG, &GLC, &SLC, IsGFX10Plus ? &DLC : nullptr)) - return Op; - } + unsigned CPol = cast<ConstantSDNode>( + Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue(); + if (BaseOpcode->Atomic) + CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization + if (CPol & ~AMDGPU::CPol::ALL) + return Op; SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) @@ -6148,16 +6295,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, if (IsGFX10Plus) Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); - if (IsGFX10Plus) - Ops.push_back(DLC); - Ops.push_back(GLC); - Ops.push_back(SLC); + Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32)); Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); if (IsGFX10Plus) Ops.push_back(IsA16 ? True : False); - Ops.push_back(TFE); - Ops.push_back(LWE); + if (!Subtarget->hasGFX90AInsts()) { + Ops.push_back(TFE); //tfe + } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { + report_fatal_error("TFE is not supported on this GPU"); + } + Ops.push_back(LWE); // lwe if (!IsGFX10Plus) Ops.push_back(DimInfo->DA ? 
True : False); if (BaseOpcode->HasD16) @@ -6175,7 +6323,15 @@ SDValue SITargetLowering::lowerImage(SDValue Op, : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); } else { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (Subtarget->hasGFX90AInsts()) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + report_fatal_error( + "requested image instruction is not supported on this GPU"); + } + if (Opcode == -1 && + Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, NumVDataDwords, NumVAddrDwords); if (Opcode == -1) @@ -6194,15 +6350,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<SDValue, 1> Elt; DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1); return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL); - } else if (!BaseOpcode->Store) { - return constructRetValue(DAG, NewNode, - OrigResultTypes, IsTexFail, - Subtarget->hasUnpackedD16VMem(), IsD16, - DMaskLanes, NumVDataDwords, DL, - *DAG.getContext()); } - - return SDValue(NewNode, 0); + if (BaseOpcode->Store) + return SDValue(NewNode, 0); + return constructRetValue(DAG, NewNode, + OrigResultTypes, IsTexFail, + Subtarget->hasUnpackedD16VMem(), IsD16, + DMaskLanes, NumVDataDwords, DL); } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, @@ -6448,11 +6602,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget); - SDValue GLC; - SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); - if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, - IsGFX10Plus ? &DLC : nullptr)) + unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); + if (CPol & ~AMDGPU::CPol::ALL) return Op; return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); @@ -6607,6 +6758,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_alignbit: return DAG.getNode(ISD::FSHR, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_perm: + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { Module *M = const_cast<Module *>(MF.getFunction().getParent()); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); @@ -6626,28 +6780,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } -// This function computes an appropriate offset to pass to -// MachineMemOperand::setOffset() based on the offset inputs to -// an intrinsic. If any of the offsets are non-contstant or -// if VIndex is non-zero then this function returns 0. Otherwise, -// it returns the sum of VOffset, SOffset, and Offset. -static unsigned getBufferOffsetForMMO(SDValue VOffset, - SDValue SOffset, - SDValue Offset, - SDValue VIndex = SDValue()) { - +/// Update \p MMO based on the offset inputs to an intrinsic. 
+static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset, + SDValue SOffset, SDValue Offset, + SDValue VIndex = SDValue()) { if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || - !isa<ConstantSDNode>(Offset)) - return 0; + !isa<ConstantSDNode>(Offset)) { + // The combined offset is not known to be constant, so we cannot represent + // it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; + } - if (VIndex) { - if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) - return 0; + if (VIndex && (!isa<ConstantSDNode>(VIndex) || + !cast<ConstantSDNode>(VIndex)->isNullValue())) { + // The strided index component of the address is not known to be zero, so we + // cannot represent it in the MMO. Give up. + MMO->setValue((Value *)nullptr); + return; } - return cast<ConstantSDNode>(VOffset)->getSExtValue() + - cast<ConstantSDNode>(SOffset)->getSExtValue() + - cast<ConstantSDNode>(Offset)->getSExtValue(); + MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue()); } SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, @@ -6670,13 +6825,21 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, M->getMemOperand()); } +// Return a value to use for the idxen operand by examining the vindex operand. +static unsigned getIdxEn(SDValue VIndex) { + if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex)) + // No need to set idxen if vindex is known to be zero. + return VIndexC->getZExtValue() != 0; + return 1; +} + SDValue SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, unsigned NewOpcode) const { @@ -6697,8 +6860,7 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); EVT MemVT = VData.getValueType(); return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, @@ -6811,9 +6973,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_load_format: { unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6824,11 +6984,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - - unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; @@ -6836,7 +6992,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6868,7 +7024,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]); return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: @@ -6888,8 +7044,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], - Ops[2])); + updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]); return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { @@ -6900,9 +7055,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(3)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // rsrc @@ -6983,9 +7136,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_xor: case Intrinsic::amdgcn_buffer_atomic_fadd: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // vdata @@ -6997,14 +7148,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); unsigned Opcode = 0; switch (IntrID) { @@ -7042,7 +7191,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; case Intrinsic::amdgcn_buffer_atomic_fadd: - if (!Op.getValue(0).use_empty()) { + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { DiagnosticInfoUnsupported NoFpRet(DAG.getMachineFunction().getFunction(), "return versions of fp atomics not supported", @@ -7063,6 +7212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); case Intrinsic::amdgcn_struct_buffer_atomic_fadd: return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD); + case Intrinsic::amdgcn_raw_buffer_atomic_fmin: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_struct_buffer_atomic_fmin: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN); + case Intrinsic::amdgcn_raw_buffer_atomic_fmax: + return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); + case Intrinsic::amdgcn_struct_buffer_atomic_fmax: + return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX); case Intrinsic::amdgcn_raw_buffer_atomic_swap: return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP); case Intrinsic::amdgcn_raw_buffer_atomic_add: @@ -7119,9 +7276,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_cmpswap: { unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(5)); SDValue Ops[] = { Op.getOperand(0), // Chain Op.getOperand(2), // src @@ -7134,13 +7289,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); - // We don't know the offset if vindex is non-zero, so clear it. 
- if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7161,7 +7314,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -7182,33 +7335,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], - Ops[4])); + updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } - case Intrinsic::amdgcn_global_atomic_fadd: { - if (!Op.getValue(0).use_empty()) { - DiagnosticInfoUnsupported - NoFpRet(DAG.getMachineFunction().getFunction(), - "return versions of fp atomics not supported", - DL.getDebugLoc(), DS_Error); - DAG.getContext()->diagnose(NoFpRet); - return SDValue(); - } - MemSDNode *M = cast<MemSDNode>(Op); - SDValue Ops[] = { - M->getOperand(0), // Chain - M->getOperand(2), // Ptr - M->getOperand(3) // Value - }; - - EVT VT = Op.getOperand(3).getValueType(); - return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, - DAG.getVTList(VT, MVT::Other), Ops, - M->getMemOperand()); - } case Intrinsic::amdgcn_image_bvh_intersect_ray: { SDLoc DL(Op); MemSDNode *M = cast<MemSDNode>(Op); @@ -7224,6 +7355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(RayDir.getValueType() == MVT::v4f16 || RayDir.getValueType() == MVT::v4f32); + if (!Subtarget->hasGFX10_AEncoding()) { + emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); + return SDValue(); + } + bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16; bool Is64 = NodePtr.getValueType() == MVT::i64; unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa @@ -7279,7 +7415,55 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, DAG.setNodeMemRefs(NewNode, {MemRef}); return SDValue(NewNode, 0); } + case Intrinsic::amdgcn_global_atomic_fadd: + if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) { + DiagnosticInfoUnsupported + NoFpRet(DAG.getMachineFunction().getFunction(), + "return versions of fp atomics not supported", + DL.getDebugLoc(), DS_Error); + DAG.getContext()->diagnose(NoFpRet); + return SDValue(); + } + LLVM_FALLTHROUGH; + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmax: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + unsigned Opcode = 0; + switch (IntrID) { + case Intrinsic::amdgcn_global_atomic_fadd: + case Intrinsic::amdgcn_flat_atomic_fadd: { + EVT VT = Op.getOperand(3).getValueType(); + return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT, + DAG.getVTList(VT, MVT::Other), Ops, + M->getMemOperand()); + } + case Intrinsic::amdgcn_global_atomic_fmin: + case Intrinsic::amdgcn_flat_atomic_fmin: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN; + break; + } + case Intrinsic::amdgcn_global_atomic_fmax: + case Intrinsic::amdgcn_flat_atomic_fmax: { + Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX; + break; + } + default: + llvm_unreachable("unhandled atomic opcode"); + } + return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: + if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID)) return lowerImage(Op, ImageDimIntr, DAG, true); @@ -7448,9 +7632,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue(); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, // vdata @@ -7461,7 +7643,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(7), // offset DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7486,7 +7668,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(7), // format Op.getOperand(8), // cachepolicy, swizzled buffer - DAG.getTargetConstant(1, DL, MVT::i1), // idexen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7511,7 +7693,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.second, // offset Op.getOperand(6), // format Op.getOperand(7), // cachepolicy, swizzled buffer - DAG.getTargetConstant(0, DL, MVT::i1), // idexen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -7528,9 +7710,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, VData = handleD16VData(VData, DAG); unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue(); - unsigned IdxEn = 1; - if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) - IdxEn = Idx->getZExtValue() != 0; + unsigned IdxEn = getIdxEn(Op.getOperand(4)); SDValue Ops[] = { Chain, VData, @@ -7542,15 +7722,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); - // We don't know the offset if vindex is non-zero, so clear it. - if (IdxEn) - Offset = 0; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(Offset); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7597,7 +7775,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32) @@ -7644,8 +7822,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); - M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], - Ops[3])); + updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]); // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics EVT VDataType = VData.getValueType().getScalarType(); @@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. 
@@ -7725,9 +7902,9 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
-                                            SelectionDAG &DAG, SDValue *Offsets,
-                                            Align Alignment) const {
+void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+                                        SelectionDAG &DAG, SDValue *Offsets,
+                                        Align Alignment) const {
   SDLoc DL(CombinedOffset);
   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
     uint32_t Imm = C->getZExtValue();
@@ -7737,7 +7914,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
-      return SOffset + ImmOffset;
+      return;
     }
   }
   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7750,13 +7927,12 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
       Offsets[0] = N0;
       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
       Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
-      return 0;
+      return;
     }
   }
   Offsets[0] = CombinedOffset;
   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
   Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
-  return 0;
 }

 // Handle 8 bit and 16 bit buffer loads
@@ -8263,8 +8439,8 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {

 // Returns immediate value for setting the F32 denorm mode when using the
 // S_DENORM_MODE instruction.
-static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
-                                          const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+                                    const SDLoc &SL, const GCNSubtarget *ST) {
   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
   int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
                                 ? FP_DENORM_FLUSH_NONE
@@ -8794,18 +8970,20 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
 }

 // Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cmdmask_b32 to be deserialized.
+// memory or argument and does not require v_cndmask_b32 to be deserialized.
 static bool isBoolSGPR(SDValue V) {
   if (V.getValueType() != MVT::i1)
     return false;
   switch (V.getOpcode()) {
-  default: break;
+  default:
+    break;
   case ISD::SETCC:
+  case AMDGPUISD::FP_CLASS:
+    return true;
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
-  case AMDGPUISD::FP_CLASS:
-    return true;
+    return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
   }
   return false;
 }
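The isBoolSGPR hunk above tightens a correctness bug: AND/OR/XOR used to be accepted unconditionally, but a bitwise op on i1 only stays a lane mask if both of its inputs already are lane masks, so the check now recurses into the operands. A minimal sketch of the same predicate on a toy expression tree (types invented for illustration):

    #include <vector>

    enum class Kind { SetCC, FPClass, And, Or, Xor, Other };

    struct Node {
      Kind K;
      std::vector<const Node *> Ops;
    };

    // An i1 value is a lane mask if produced by a compare/class op, or by
    // a bitwise op whose *both* inputs are lane masks -- the refinement
    // the hunk above makes over the unconditional AND/OR/XOR acceptance.
    bool isBoolMask(const Node *N) {
      switch (N->K) {
      case Kind::SetCC:
      case Kind::FPClass:
        return true;
      case Kind::And:
      case Kind::Or:
      case Kind::Xor:
        return isBoolMask(N->Ops[0]) && isBoolMask(N->Ops[1]);
      default:
        return false;
      }
    }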
@@ -9206,63 +9384,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   return SDValue();
 }

-// Instructions that will be lowered with a final instruction that zeros the
-// high result bits.
-// XXX - probably only need to list legal operations.
-static bool fp16SrcZerosHighBits(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FADD:
-  case ISD::FSUB:
-  case ISD::FMUL:
-  case ISD::FDIV:
-  case ISD::FREM:
-  case ISD::FMA:
-  case ISD::FMAD:
-  case ISD::FCANONICALIZE:
-  case ISD::FP_ROUND:
-  case ISD::UINT_TO_FP:
-  case ISD::SINT_TO_FP:
-  case ISD::FABS:
-    // Fabs is lowered to a bit operation, but it's an and which will clear the
-    // high bits anyway.
-  case ISD::FSQRT:
-  case ISD::FSIN:
-  case ISD::FCOS:
-  case ISD::FPOWI:
-  case ISD::FPOW:
-  case ISD::FLOG:
-  case ISD::FLOG2:
-  case ISD::FLOG10:
-  case ISD::FEXP:
-  case ISD::FEXP2:
-  case ISD::FCEIL:
-  case ISD::FTRUNC:
-  case ISD::FRINT:
-  case ISD::FNEARBYINT:
-  case ISD::FROUND:
-  case ISD::FFLOOR:
-  case ISD::FMINNUM:
-  case ISD::FMAXNUM:
-  case AMDGPUISD::FRACT:
-  case AMDGPUISD::CLAMP:
-  case AMDGPUISD::COS_HW:
-  case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
-  case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMAD_FTZ:
-  case AMDGPUISD::RCP:
-  case AMDGPUISD::RSQ:
-  case AMDGPUISD::RCP_IFLAG:
-  case AMDGPUISD::LDEXP:
-    return true;
-  default:
-    // fcopysign, select and others may be lowered to 32-bit bit operations
-    // which don't zero the high bits.
-    return false;
-  }
-}
-
 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   if (!Subtarget->has16BitInsts() ||
@@ -9277,15 +9398,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
   if (Src.getValueType() != MVT::i16)
     return SDValue();

-  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
-  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
-  if (Src.getOpcode() == ISD::BITCAST) {
-    SDValue BCSrc = Src.getOperand(0);
-    if (BCSrc.getValueType() == MVT::f16 &&
-        fp16SrcZerosHighBits(BCSrc.getOpcode()))
-      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
-  }
-
   return SDValue();
 }

@@ -9482,19 +9594,18 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     // Could be anything.
     return false;

-  case ISD::BITCAST: {
+  case ISD::BITCAST:
+    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+  case ISD::TRUNCATE: {
     // Hack round the mess we make when legalizing extract_vector_elt
-    SDValue Src = Op.getOperand(0);
-    if (Src.getValueType() == MVT::i16 &&
-        Src.getOpcode() == ISD::TRUNCATE) {
-      SDValue TruncSrc = Src.getOperand(0);
+    if (Op.getValueType() == MVT::i16) {
+      SDValue TruncSrc = Op.getOperand(0);
       if (TruncSrc.getValueType() == MVT::i32 &&
           TruncSrc.getOpcode() == ISD::BITCAST &&
           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
       }
     }
-    return false;
   }

   case ISD::INTRINSIC_WO_CHAIN: {
@@ -9527,6 +9638,45 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   llvm_unreachable("invalid operation");
 }

+bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+                                       unsigned MaxDepth) const {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  unsigned Opcode = MI->getOpcode();
+
+  if (Opcode == AMDGPU::G_FCANONICALIZE)
+    return true;
+
+  if (Opcode == AMDGPU::G_FCONSTANT) {
+    auto F = MI->getOperand(1).getFPImm()->getValueAPF();
+    if (F.isNaN() && F.isSignaling())
+      return false;
+    return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF);
+  }
+
+  if (MaxDepth == 0)
+    return false;
+
+  switch (Opcode) {
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE: {
+    if (Subtarget->supportsMinMaxDenormModes() ||
+        denormalsEnabledForType(MRI.getType(Reg), MF))
+      return true;
+    for (unsigned I = 1, E = MI->getNumOperands(); I != E; ++I) {
+      if (!isCanonicalized(MI->getOperand(I).getReg(), MF, MaxDepth - 1))
+        return false;
+    }
+    return true;
+  }
+  default:
+    return denormalsEnabledForType(MRI.getType(Reg), MF) &&
+           isKnownNeverSNaN(Reg, MRI);
+  }
+
+  llvm_unreachable("invalid operation");
+}
+
 // Constant fold canonicalize.
 SDValue SITargetLowering::getCanonicalConstantFP(
     SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
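The G_FCONSTANT case in the new GlobalISel isCanonicalized overload above encodes two facts: a signaling NaN is never canonical, and a denormal constant is canonical only while denormals are enabled for that type. A standalone restatement with raw IEEE-754 binary32 bit tests (assuming the usual MSB-is-quiet NaN convention, which AMDGPU and most hosts follow):

    #include <cstdint>
    #include <cstring>

    // Is this IEEE-754 single a signaling NaN?  Exponent all ones, quiet
    // bit (mantissa MSB) clear, non-zero payload.
    bool isSignalingNaN32(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      const bool ExpAllOnes = (Bits & 0x7f800000u) == 0x7f800000u;
      const bool QuietBit = (Bits & 0x00400000u) != 0;
      const bool Payload = (Bits & 0x003fffffu) != 0;
      return ExpAllOnes && !QuietBit && Payload;
    }

    // Mirror of the G_FCONSTANT case: sNaN is never canonical; a denormal
    // (zero exponent, non-zero mantissa) is canonical only when denormals
    // are enabled for the type.
    bool constantIsCanonical(float F, bool DenormalsEnabled) {
      if (isSignalingNaN32(F))
        return false;
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      const bool IsDenormal =
          (Bits & 0x7f800000u) == 0 && (Bits & 0x007fffffu) != 0;
      return !IsDenormal || DenormalsEnabled;
    }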
@@ -9694,15 +9844,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine(
   }

   // If there isn't a 16-bit med3 operation, convert to 32-bit.
-  MVT NVT = MVT::i32;
-  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+  if (VT == MVT::i16) {
+    MVT NVT = MVT::i32;
+    unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+    SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+    SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+    SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);

-  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
-  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
-  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+    SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+    return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+  }

-  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
-  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
+  return SDValue();
 }

 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
@@ -10408,7 +10562,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   EVT VT = N->getValueType(0);
   SDLoc SL(N);

-  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
+  if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
     return SDValue();

   // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
@@ -10791,7 +10945,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   unsigned NewDmask = 0;
   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
-  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
+  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                   Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
   unsigned TFCLane = 0;
   bool HasChain = Node->getNumValues() > 1;
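The performIntMed3ImmCombine hunk above now only widens when the type is really i16: extend both min/max operands and the immediate to 32 bits, form the med3 there, then truncate back. The same transformation in scalar form, using the standard min/max formulation of a three-way median:

    #include <algorithm>
    #include <cstdint>

    // med3(a, b, c): the middle of three values.
    int32_t med3(int32_t A, int32_t B, int32_t C) {
      return std::max(std::min(A, B), std::min(std::max(A, B), C));
    }

    // The widening in the hunk above, in scalar form: with no native
    // 16-bit med3, sign-extend to 32 bits (zero-extend for the unsigned
    // variant), take the med3 there, and truncate back.
    int16_t med3_i16_via_i32(int16_t A, int16_t B, int16_t C) {
      return static_cast<int16_t>(med3(int32_t(A), int32_t(B), int32_t(C)));
    }

This is value-preserving because extension is monotonic and the result of the 32-bit median is always one of the three extended inputs, so it fits back in 16 bits.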
@@ -11067,6 +11221,95 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
   return Node;
 }

+// Any MIMG instructions that use tfe or lwe require an initialization of the
+// result register that will be written in the case of a memory access failure.
+// The required code is also added to tie this init code to the result of the
+// img instruction.
+void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
+  MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
+  MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
+
+  if (!TFE && !LWE) // intersect_ray
+    return;
+
+  unsigned TFEVal = TFE ? TFE->getImm() : 0;
+  unsigned LWEVal = LWE->getImm();
+  unsigned D16Val = D16 ? D16->getImm() : 0;
+
+  if (!TFEVal && !LWEVal)
+    return;
+
+  // At least one of TFE or LWE is non-zero.
+  // We have to insert a suitable initialization of the result value and
+  // tie this to the dest of the image instruction.
+
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  int DstIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
+
+  // Calculate which dword we have to initialize to 0.
+  MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
+
+  // Check that the dmask operand is found.
+  assert(MO_Dmask && "Expected dmask operand in instruction");
+
+  unsigned dmask = MO_Dmask->getImm();
+  // Determine the number of active lanes taking into account the
+  // Gather4 special case.
+  unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
+
+  bool Packed = !Subtarget->hasUnpackedD16VMem();
+
+  unsigned InitIdx =
+      D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
+
+  // Abandon the attempt if the dst size isn't large enough
+  // - this is in fact an error but it is picked up elsewhere and
+  //   reported correctly.
+  uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
+  if (DstSize < InitIdx)
+    return;
+
+  // Create a register for the initialization value.
+  Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+  unsigned NewDst = 0; // Final initialized value will be in here
+
+  // If the PRTStrictNull feature is enabled (the default) then initialize
+  // all the result registers to 0, otherwise just the error indication
+  // register (VGPRn+1).
+  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
+  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
+  for (; SizeLeft; SizeLeft--, CurrIdx++) {
+    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
+    // Initialize dword
+    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
+        .addImm(0);
+    // Insert into the super-reg
+    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
+        .addReg(PrevDst)
+        .addReg(SubReg)
+        .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
+
+    PrevDst = NewDst;
+  }
+
+  // Add as an implicit operand
+  MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
+
+  // Tie the just added implicit operand to the dst
+  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
+}
+
 /// Assign the register class depending on the number of
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
@@ -11114,10 +11357,12 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
   if (NoRetAtomicOp != -1) {
     if (!Node->hasAnyUseOfValue(0)) {
-      int Glc1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                               AMDGPU::OpName::glc1);
-      if (Glc1Idx != -1)
-        MI.RemoveOperand(Glc1Idx);
+      int CPolIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                               AMDGPU::OpName::cpol);
+      if (CPolIdx != -1) {
+        MachineOperand &CPol = MI.getOperand(CPolIdx);
+        CPol.setImm(CPol.getImm() & ~AMDGPU::CPol::GLC);
+      }
       MI.RemoveOperand(0);
       MI.setDesc(TII->get(NoRetAtomicOp));
       return;
@@ -11148,6 +11393,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
     }
     return;
   }
+
+  if (TII->isMIMG(MI) && !MI.mayStore())
+    AddIMGInit(MI);
 }

 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
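The InitIdx computation in AddIMGInit above determines how many result dwords must be zero-initialized: the popcount of dmask gives the active lanes (gather4 always returns four), packed D16 stores two half-lanes per dword, and one extra dword holds the TFE/LWE error status. A standalone sketch of just that arithmetic (C++20 for std::popcount; the parameter names are mine):

    #include <bit>

    // Number of result dwords a TFE/LWE image load must initialize:
    // active lanes from the dmask (gather4 fixed at 4), halved and
    // rounded up when D16 results are packed, plus one status dword.
    unsigned numInitDwords(unsigned DMask, bool IsGather4, bool D16,
                           bool PackedD16) {
      unsigned ActiveLanes = IsGather4 ? 4 : unsigned(std::popcount(DMask));
      return (D16 && PackedD16) ? ((ActiveLanes + 1) >> 1) + 1
                                : ActiveLanes + 1;
    }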
@@ -11226,9 +11474,11 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
 //===----------------------------------------------------------------------===//

 std::pair<unsigned, const TargetRegisterClass *>
-SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
                                                StringRef Constraint,
                                                MVT VT) const {
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
+
   const TargetRegisterClass *RC = nullptr;
   if (Constraint.size() == 1) {
     const unsigned BitWidth = VT.getSizeInBits();
@@ -11257,7 +11507,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       RC = &AMDGPU::VGPR_32RegClass;
       break;
     default:
-      RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+      RC = TRI->getVGPRClassForBitWidth(BitWidth);
       if (!RC)
         return std::make_pair(0U, nullptr);
       break;
@@ -11271,7 +11521,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       RC = &AMDGPU::AGPR_32RegClass;
       break;
     default:
-      RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+      RC = TRI->getAGPRClassForBitWidth(BitWidth);
       if (!RC)
         return std::make_pair(0U, nullptr);
       break;
@@ -11444,6 +11694,47 @@ bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
   return false;
 }

+static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
+  switch (UnalignedClassID) {
+  case AMDGPU::VReg_64RegClassID:
+    return AMDGPU::VReg_64_Align2RegClassID;
+  case AMDGPU::VReg_96RegClassID:
+    return AMDGPU::VReg_96_Align2RegClassID;
+  case AMDGPU::VReg_128RegClassID:
+    return AMDGPU::VReg_128_Align2RegClassID;
+  case AMDGPU::VReg_160RegClassID:
+    return AMDGPU::VReg_160_Align2RegClassID;
+  case AMDGPU::VReg_192RegClassID:
+    return AMDGPU::VReg_192_Align2RegClassID;
+  case AMDGPU::VReg_224RegClassID:
+    return AMDGPU::VReg_224_Align2RegClassID;
+  case AMDGPU::VReg_256RegClassID:
+    return AMDGPU::VReg_256_Align2RegClassID;
+  case AMDGPU::VReg_512RegClassID:
+    return AMDGPU::VReg_512_Align2RegClassID;
+  case AMDGPU::VReg_1024RegClassID:
+    return AMDGPU::VReg_1024_Align2RegClassID;
+  case AMDGPU::AReg_64RegClassID:
+    return AMDGPU::AReg_64_Align2RegClassID;
+  case AMDGPU::AReg_96RegClassID:
+    return AMDGPU::AReg_96_Align2RegClassID;
+  case AMDGPU::AReg_128RegClassID:
+    return AMDGPU::AReg_128_Align2RegClassID;
+  case AMDGPU::AReg_160RegClassID:
+    return AMDGPU::AReg_160_Align2RegClassID;
+  case AMDGPU::AReg_192RegClassID:
+    return AMDGPU::AReg_192_Align2RegClassID;
+  case AMDGPU::AReg_256RegClassID:
+    return AMDGPU::AReg_256_Align2RegClassID;
+  case AMDGPU::AReg_512RegClassID:
+    return AMDGPU::AReg_512_Align2RegClassID;
+  case AMDGPU::AReg_1024RegClassID:
+    return AMDGPU::AReg_1024_Align2RegClassID;
+  default:
+    return -1;
+  }
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -11452,6 +11743,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();

   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
@@ -11474,7 +11766,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   Info->limitOccupancy(MF);

   if (ST.isWave32() && !MF.empty()) {
-    const SIInstrInfo *TII = ST.getInstrInfo();
     for (auto &MBB : MF) {
       for (auto &MI : MBB) {
         TII->fixImplicitOperands(MI);
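The getAlignedAGPRClassID mapping above, together with the remap loop in the next hunk, swaps each wide vector-register class for its _Align2 variant. As the class names suggest, the restriction being modeled is that on subtargets reporting needsAlignedVGPRs() a register tuple wider than one dword must start on an even-numbered register. A sketch of that legality rule in isolation (names and the exact rule are my reading of the _Align2 classes, not a quote from the ISA docs):

    #include <cassert>

    // On subtargets that need aligned VGPRs, a vector-register tuple
    // wider than 32 bits must begin at an even register index; single
    // 32-bit registers carry no extra restriction.
    bool tupleIsLegallyAligned(unsigned FirstReg, unsigned NumRegs,
                               bool NeedsAlignedVGPRs) {
      assert(NumRegs >= 1);
      if (!NeedsAlignedVGPRs || NumRegs == 1)
        return true;
      return (FirstReg % 2) == 0;
    }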
@@ -11482,13 +11773,30 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
     }
   }

+  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
+  // classes if required. Ideally the register class constraints would differ
+  // per-subtarget, but there's no easy way to achieve that right now. This is
+  // not a problem for VGPRs because the correctly aligned VGPR class is implied
+  // from using them as the register class for legal types.
+  if (ST.needsAlignedVGPRs()) {
+    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+      const Register Reg = Register::index2VirtReg(I);
+      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+      if (!RC)
+        continue;
+      int NewClassID = getAlignedAGPRClassID(RC->getID());
+      if (NewClassID != -1)
+        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
+    }
+  }
+
   TargetLoweringBase::finalizeLowering(MF);

   // Allocate a VGPR for future SGPR Spill if
   // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
   // FIXME: We won't need this hack if we split SGPR allocation from VGPR
-  if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
-      !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+  if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
+      !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
     Info->reserveVGPRforSGPRSpills(MF);
 }

@@ -11690,8 +11998,37 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
   case ISD::INTRINSIC_W_CHAIN:
     return AMDGPU::isIntrinsicSourceOfDivergence(
         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+  case AMDGPUISD::ATOMIC_CMP_SWAP:
+  case AMDGPUISD::ATOMIC_INC:
+  case AMDGPUISD::ATOMIC_DEC:
+  case AMDGPUISD::ATOMIC_LOAD_FMIN:
+  case AMDGPUISD::ATOMIC_LOAD_FMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_SWAP:
+  case AMDGPUISD::BUFFER_ATOMIC_ADD:
+  case AMDGPUISD::BUFFER_ATOMIC_SUB:
+  case AMDGPUISD::BUFFER_ATOMIC_SMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_UMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_SMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_UMAX:
+  case AMDGPUISD::BUFFER_ATOMIC_AND:
+  case AMDGPUISD::BUFFER_ATOMIC_OR:
+  case AMDGPUISD::BUFFER_ATOMIC_XOR:
+  case AMDGPUISD::BUFFER_ATOMIC_INC:
+  case AMDGPUISD::BUFFER_ATOMIC_DEC:
+  case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
+  case AMDGPUISD::BUFFER_ATOMIC_CSUB:
+  case AMDGPUISD::BUFFER_ATOMIC_FADD:
+  case AMDGPUISD::BUFFER_ATOMIC_FMIN:
+  case AMDGPUISD::BUFFER_ATOMIC_FMAX:
+    // Target-specific read-modify-write atomics are sources of divergence.
+    return true;
+  default:
+    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
+      // Generic read-modify-write atomics are sources of divergence.
+      return A->readMem() && A->writeMem();
+    }
+    return false;
   }
-  return false;
 }

@@ -11707,6 +12044,19 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
   }
 }

+bool SITargetLowering::denormalsEnabledForType(LLT Ty,
+                                               MachineFunction &MF) const {
+  switch (Ty.getScalarSizeInBits()) {
+  case 32:
+    return hasFP32Denormals(MF);
+  case 64:
+  case 16:
+    return hasFP64FP16Denormals(MF);
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                     const SelectionDAG &DAG,
                                                     bool SNaN,
@@ -11745,24 +12095,57 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     if (Ty->isHalfTy())
       return AtomicExpansionKind::None;

-    if (!Ty->isFloatTy())
+    if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
       return AtomicExpansionKind::CmpXChg;

-    // TODO: Do have these for flat. Older targets also had them for buffers.
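The new denormalsEnabledForType(LLT) overload above dispatches purely on scalar width, because the hardware gives f32 its own denormal control while f16 and f64 share one mode. A condensed standalone restatement (the two boolean parameters stand in for the per-function mode queries hasFP32Denormals and hasFP64FP16Denormals):

    // f32 has an independent denormal control; f16 and f64 share one.
    // Unknown widths conservatively report denormals as disabled.
    bool denormalsEnabledFor(unsigned ScalarSizeInBits, bool HasFP32Denorms,
                             bool HasFP64FP16Denorms) {
      switch (ScalarSizeInBits) {
      case 32:
        return HasFP32Denorms;
      case 16:
      case 64:
        return HasFP64FP16Denorms;
      default:
        return false;
      }
    }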
     unsigned AS = RMW->getPointerAddressSpace();
-    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
-      if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+    if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+        Subtarget->hasAtomicFaddInsts()) {
+      // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+      // floating point atomic instructions. May generate more efficient code,
+      // but may not respect rounding and denormal modes, and may give
+      // incorrect results for certain memory destinations.
+      if (RMW->getFunction()
+              ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+              .getValueAsString() != "true")
+        return AtomicExpansionKind::CmpXChg;
+
+      if (Subtarget->hasGFX90AInsts()) {
+        if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
+          return AtomicExpansionKind::CmpXChg;
+
+        auto SSID = RMW->getSyncScopeID();
+        if (SSID == SyncScope::System ||
+            SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+          return AtomicExpansionKind::CmpXChg;
+
+        return AtomicExpansionKind::None;
+      }
+
+      if (AS == AMDGPUAS::FLAT_ADDRESS)
         return AtomicExpansionKind::CmpXChg;

-      return RMW->use_empty() ? AtomicExpansionKind::None :
-                                AtomicExpansionKind::CmpXChg;
+      return RMW->use_empty() ? AtomicExpansionKind::None
+                              : AtomicExpansionKind::CmpXChg;
     }

     // DS FP atomics do respect the denormal mode, but the rounding mode is
     // fixed to round-to-nearest-even.
-    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
-      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
+    // The only exception is DS_ADD_F64 which never flushes regardless of mode.
+    if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+      if (!Ty->isDoubleTy())
+        return AtomicExpansionKind::None;
+
+      return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+              RMW->getFunction()
+                  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                  .getValueAsString() == "true")
+                 ? AtomicExpansionKind::None
+                 : AtomicExpansionKind::CmpXChg;
+    }
+
+    return AtomicExpansionKind::CmpXChg;
   }
   default:
     break;
@@ -11872,10 +12255,11 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
 }

-std::pair<int, MVT>
+std::pair<InstructionCost, MVT>
 SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
                                           Type *Ty) const {
-  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+  std::pair<InstructionCost, MVT> Cost =
+      TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
   auto Size = DL.getTypeSizeInBits(Ty);
   // Maximum load or store can handle 8 dwords for scalar and 4 for
   // vector ALU. Let's assume anything above 8 dwords is expensive
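The shouldExpandAtomicRMWInIR changes above gate the hardware fadd path behind the amdgpu-unsafe-fp-atomics function attribute. A condensed restatement of just the pre-gfx90a global/flat fadd branch, as a standalone decision function (the enum and parameter names are mine; ResultUsed corresponds to !RMW->use_empty()):

    enum class ExpansionKind { None, CmpXChg };

    // Pre-gfx90a float-add path from the hunk above: the unsafe-fp-atomics
    // attribute must be opted into, a flat address never gets the native
    // instruction, and the atomic's result must be unused.
    ExpansionKind expandGlobalFAdd(bool UnsafeFpAtomics, bool IsFlatAddress,
                                   bool ResultUsed) {
      if (!UnsafeFpAtomics)
        return ExpansionKind::CmpXChg;
      if (IsFlatAddress)
        return ExpansionKind::CmpXChg;
      return ResultUsed ? ExpansionKind::CmpXChg : ExpansionKind::None;
    }

Everything that falls through to CmpXChg is lowered as a compare-exchange loop, which honors the function's rounding and denormal modes at the cost of extra memory traffic.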