author    Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-07-28 10:51:19 +0000
commit    eb11fae6d08f479c0799db45860a98af528fa6e7 (patch)
tree      44d492a50c8c1a7eb8e2d17ea3360ec4d066f042 /lib/Target/AMDGPU/SIISelLowering.cpp
parent    b8a2042aa938069e862750553db0e4d82d25822c (diff)
Vendor import of llvm trunk r338150 (tag: vendor/llvm/llvm-trunk-r338150)

Notes:
    svn path=/vendor/llvm/dist/; revision=336809
    svn path=/vendor/llvm/llvm-trunk-r338150/; revision=336814; tag=vendor/llvm/llvm-trunk-r338150
Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp  2200
1 file changed, 1766 insertions(+), 434 deletions(-)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 50ee88fa635a..5b7fc2656a20 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// \brief Custom DAG lowering for SI
+/// Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
@@ -26,6 +26,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -49,7 +50,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
@@ -73,6 +73,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <cassert>
@@ -111,8 +112,9 @@ static unsigned findFirstFreeSGPR(CCState &CCInfo) {
}
SITargetLowering::SITargetLowering(const TargetMachine &TM,
- const SISubtarget &STI)
- : AMDGPUTargetLowering(TM, STI) {
+ const GCNSubtarget &STI)
+ : AMDGPUTargetLowering(TM, STI),
+ Subtarget(&STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
@@ -138,14 +140,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
- }
- if (Subtarget->hasVOP3PInsts()) {
+ // Unless there are also VOP3P operations, not all operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- computeRegisterProperties(STI.getRegisterInfo());
+ computeRegisterProperties(Subtarget->getRegisterInfo());
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
@@ -173,7 +176,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
@@ -205,13 +207,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -231,13 +237,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
#endif
- //setOperationAction(ISD::ADDC, MVT::i64, Expand);
- //setOperationAction(ISD::SUBC, MVT::i64, Expand);
-
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
- MVT::v2i64, MVT::v2f64}) {
+ MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -260,6 +263,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
// is expanded to avoid having two separate loops in case the index is a VGPR.
@@ -284,12 +289,30 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
+
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling
@@ -301,7 +324,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
- if (getSubtarget()->hasFlatAddressSpace()) {
+ if (Subtarget->hasFlatAddressSpace()) {
setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
@@ -314,13 +337,56 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Custom);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FLOG, MVT::f16, Custom);
+ setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+ }
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ if (!Subtarget->hasBFI()) {
+ // fcopysign can be done in a single instruction with BFI.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ }
+
+ if (!Subtarget->hasBCNT(32))
+ setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+
+ if (!Subtarget->hasBCNT(64))
+ setOperationAction(ISD::CTPOP, MVT::i64, Expand);
+
+ if (Subtarget->hasFFBH())
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
- if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
+ } else {
+ setOperationAction(ISD::FCEIL, MVT::f64, Custom);
+ setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
+ setOperationAction(ISD::FRINT, MVT::f64, Custom);
+ setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
@@ -357,6 +423,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
setOperationAction(ISD::CTLZ, MVT::i16, Promote);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+ setOperationAction(ISD::CTPOP, MVT::i16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
@@ -406,10 +473,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f16, Legal);
if (!Subtarget->hasFP16Denormals())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- }
- if (Subtarget->hasVOP3PInsts()) {
- for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -436,6 +501,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
+ setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
+
setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
setOperationAction(ISD::STORE, MVT::v2f16, Promote);
@@ -452,11 +520,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
setOperationAction(ISD::XOR, MVT::v2i16, Promote);
AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
- setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
- AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
+ if (!Subtarget->hasVOP3PInsts()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+ // This isn't really legal, but this avoids the legalizer unrolling it (and
+ // allows matching fneg (fabs x) patterns)
+ setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ }
+
+ if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);
@@ -469,26 +564,51 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
setOperationAction(ISD::FADD, MVT::v2f16, Legal);
- setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
setOperationAction(ISD::FMA, MVT::v2f16, Legal);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
-
- // This isn't really legal, but this avoids the legalizer unrolling it (and
- // allows matching fneg (fabs x) patterns)
- setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+ setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+ setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+ setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
+ }
+
+ setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
+ if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+ setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
} else {
+ // Legalization hack.
setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
@@ -503,6 +623,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SMIN);
setTargetDAGCombine(ISD::SMAX);
setTargetDAGCombine(ISD::UMIN);
@@ -540,16 +661,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setSchedulingPreference(Sched::RegPressure);
+
+ // SI at least has hardware support for floating point exceptions, but no way
+ // of using or handling them is implemented. They are also optional in OpenCL
+ // (Section 7.3)
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
}
-const SISubtarget *SITargetLowering::getSubtarget() const {
- return static_cast<const SISubtarget *>(Subtarget);
+const GCNSubtarget *SITargetLowering::getSubtarget() const {
+ return Subtarget;
}
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+// v_mad_mix* support a conversion from f16 to f32.
+//
+// There is only one special case, when denormals are enabled, where this is
+// still OK to use, but we don't currently handle it.
+bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
+ return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
+ (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
+ DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
+ SrcVT.getScalarType() == MVT::f16;
+}
+
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
// shuffles are legal in order to prefer scalarizing some vector operations.
@@ -560,9 +698,55 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
unsigned IntrID) const {
+ if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
+ (Intrinsic::ID)IntrID);
+ if (Attr.hasFnAttribute(Attribute::ReadNone))
+ return false;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ if (RsrcIntr->IsImage) {
+ Info.ptrVal = MFI->getImagePSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ Info.align = 0;
+ } else {
+ Info.ptrVal = MFI->getBufferPSV(
+ *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+ CI.getArgOperand(RsrcIntr->RsrcArg));
+ }
+
+ Info.flags = MachineMemOperand::MODereferenceable;
+ if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags |= MachineMemOperand::MOLoad;
+ } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+ Info.flags |= MachineMemOperand::MOStore;
+ } else {
+ // Atomic
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable;
+
+ // XXX - Should this be volatile without known ordering?
+ Info.flags |= MachineMemOperand::MOVolatile;
+ }
+ return true;
+ }
+
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -575,6 +759,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+
default:
return false;
}
@@ -585,7 +770,10 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
Value *Ptr = II->getArgOperand(0);
AccessTy = II->getType();
Ops.push_back(Ptr);
@@ -675,7 +863,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUASI.GLOBAL_ADDRESS)
return isLegalGlobalAddressingMode(AM);
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -686,19 +875,19 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
- if (DL.getTypeStoreSize(Ty) < 4)
+ if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
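// --------------------------------------------------------------------------
// Illustrative sketch only (plain C++, not the LLVM API, and not part of this
// commit): the per-generation SMRD/SMEM immediate-offset rules checked above.
// The enum and helper names are hypothetical stand-ins.
#include <cstdint>
namespace smrd_sketch {
enum class Gen { SI, CI, VI };

inline bool isLegalSMRDImmOffset(Gen G, int64_t ByteOffset) {
  switch (G) {
  case Gen::SI: // SI: 8-bit dword offset
    return ByteOffset >= 0 && (ByteOffset / 4) < (1 << 8);
  case Gen::CI: // CI+: 32-bit literal dword offset
    return ByteOffset >= 0 && (ByteOffset / 4) <= INT64_C(0xffffffff);
  case Gen::VI: // VI: SMEM format, 20-bit byte offset
    return ByteOffset >= 0 && ByteOffset < (1 << 20);
  }
  return false;
}
} // namespace smrd_sketch
// --------------------------------------------------------------------------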
@@ -798,7 +987,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
- *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+ *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+ AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
(Align % 4 == 0) : true;
}
@@ -841,7 +1031,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
return AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS ||
- AS == AMDGPUASI.CONSTANT_ADDRESS;
+ AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
}
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -853,7 +1044,7 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
- const Instruction *I = dyn_cast<Instruction>(Ptr);
+ const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.noclobber");
}
@@ -870,7 +1061,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
- return AMDGPU::isUniformMMO(MemNode->getMemOperand());
+ return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
}
TargetLoweringBase::LegalizeTypeAction
@@ -932,14 +1123,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
const SDLoc &SL) const {
- auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
- uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
+ FIRST_IMPLICIT);
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
@@ -966,18 +1156,42 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
SDValue SITargetLowering::lowerKernargMemParameter(
SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, bool Signed,
+ uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
- const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- unsigned Align = DL.getABITypeAlignment(Ty);
+ // Try to avoid using an extload by loading earlier than the argument address,
+ // and extracting the relevant bits. The load should hopefully be merged with
+ // the previous argument.
+ if (MemVT.getStoreSize() < 4 && Align < 4) {
+ // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ // TODO: If we passed in the base kernel offset we could have a better
+ // alignment than 4, but we don't really need it.
+ SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
+ SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
+ ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
+
+
+ return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
+ }
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
- MachineMemOperand::MONonTemporal |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
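// --------------------------------------------------------------------------
// Illustrative sketch only (plain C++, not the LLVM API, and not part of this
// commit): the trick used above for sub-dword kernel arguments -- do one
// 4-byte-aligned i32 load, then shift and truncate instead of an extload.
// 'Args' and the helper name are made up for the example.
#include <cstdint>
#include <cstring>
namespace kernarg_sketch {
inline uint16_t loadSubDwordArg(const uint8_t *Args, int64_t Offset) {
  int64_t AlignDownOffset = Offset & ~INT64_C(3); // alignDown(Offset, 4)
  int64_t OffsetDiff = Offset - AlignDownOffset;
  uint32_t Dword;
  std::memcpy(&Dword, Args + AlignDownOffset, 4); // one aligned dword load
  return uint16_t(Dword >> (OffsetDiff * 8));     // SRL by OffsetDiff*8, then TRUNCATE
}
} // namespace kernarg_sketch
// --------------------------------------------------------------------------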
@@ -1052,36 +1266,51 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
FunctionType *FType,
SIMachineFunctionInfo *Info) {
for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
- const ISD::InputArg &Arg = Ins[I];
+ const ISD::InputArg *Arg = &Ins[I];
// First check if it's a PS input addr.
- if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
- !Arg.Flags.isByVal() && PSInputNum <= 15) {
+ if (CallConv == CallingConv::AMDGPU_PS &&
+ !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
+
+ bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
+
+ // Inconveniently only the first part of the split is marked as isSplit,
+ // so skip to the end. We only want to increment PSInputNum once for the
+ // entire split argument.
+ if (Arg->Flags.isSplit()) {
+ while (!Arg->Flags.isSplitEnd()) {
+ assert(!Arg->VT.isVector() &&
+ "unexpected vector split in ps argument type");
+ if (!SkipArg)
+ Splits.push_back(*Arg);
+ Arg = &Ins[++I];
+ }
+ }
- if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
+ if (SkipArg) {
// We can safely skip PS inputs.
- Skipped.set(I);
+ Skipped.set(Arg->getOrigArgIndex());
++PSInputNum;
continue;
}
Info->markPSInputAllocated(PSInputNum);
- if (Arg.Used)
+ if (Arg->Used)
Info->markPSInputEnabled(PSInputNum);
++PSInputNum;
}
// Second split vertices into their elements.
- if (Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
+ if (Arg->VT.isVector()) {
+ ISD::InputArg NewArg = *Arg;
NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
+ NewArg.VT = Arg->VT.getVectorElementType();
// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
// three or five element vertex only needs three or five registers,
// NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
for (unsigned J = 0; J != NumElements; ++J) {
@@ -1089,7 +1318,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
NewArg.PartOffset += NewArg.VT.getStoreSize();
}
} else {
- Splits.push_back(Arg);
+ Splits.push_back(*Arg);
}
}
}
@@ -1347,8 +1576,8 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// the scratch registers to pass in.
bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (ST.isAmdCodeObjectV2(MF)) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.isAmdCodeObjectV2(MF.getFunction())) {
if (RequiresStackAccess) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -1460,12 +1689,12 @@ SDValue SITargetLowering::LowerFormalArguments(
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
+ const Function &Fn = MF.getFunction();
FunctionType *FType = MF.getFunction().getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
- const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported NoGraphicsHSA(
Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
@@ -1562,9 +1791,16 @@ SDValue SITargetLowering::LowerFormalArguments(
SmallVector<SDValue, 16> Chains;
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ // FIXME: This is the minimum kernel argument alignment. We should improve
+ // this to the maximum alignment of the arguments.
+ //
+ // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+ // kern arg offset.
+ const unsigned KernelArgBaseAlign = 16;
+
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
- if (Skipped[i]) {
+ if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
continue;
}
@@ -1576,19 +1812,16 @@ SDValue SITargetLowering::LowerFormalArguments(
VT = Ins[i].VT;
EVT MemVT = VA.getLocVT();
- const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
- VA.getLocMemOffset();
- Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+ const uint64_t Offset = VA.getLocMemOffset();
+ unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
- // The first 36 bytes of the input buffer contains information about
- // thread group and global sizes.
SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+ DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -1696,7 +1929,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());
+ ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
unsigned StackArgSize = CCInfo.getNextStackOffset();
Info->setBytesInStackArgArea(StackArgSize);
@@ -1841,8 +2074,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// FIXME: Does sret work properly?
if (!Info->isEntryFunction()) {
- const SIRegisterInfo *TRI
- = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
@@ -1944,8 +2176,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
- const SISubtarget *ST = getSubtarget();
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
@@ -2138,6 +2369,13 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
+ if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
+ // Note the issue is with the CC of the calling function, not of the call
+ // itself.
+ return lowerUnhandledCall(CLI, InVals,
+ "unsupported call from graphics shader of function ");
+ }
+
// The first 4 bytes are reserved for the callee's emergency stack slot.
const unsigned CalleeUsableStackOffset = 4;
@@ -2383,7 +2621,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
- const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
+ auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -2443,7 +2681,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
}
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
report_fatal_error(Twine("invalid register \""
+ StringRef(RegName) + "\" for subtarget."));
@@ -2517,7 +2755,8 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
unsigned PhiReg,
unsigned InitSaveExecReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineBasicBlock::iterator I = LoopBB.begin();
unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
@@ -2546,6 +2785,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg)
.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
+ // Update EXEC, save the original EXEC value to VCC.
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+ .addReg(CondReg, RegState::Kill);
+
+ MRI.setSimpleHint(NewExec, CondReg);
+
if (UseGPRIdxMode) {
unsigned IdxReg;
if (Offset == 0) {
@@ -2556,11 +2801,13 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
.addReg(CurrentIdxReg, RegState::Kill)
.addImm(Offset);
}
-
- MachineInstr *SetIdx =
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
- .addReg(IdxReg, RegState::Kill);
- SetIdx->getOperand(2).setIsUndef();
+ unsigned IdxMode = IsIndirectSrc ?
+ VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+ MachineInstr *SetOn =
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg, RegState::Kill)
+ .addImm(IdxMode);
+ SetOn->getOperand(3).setIsUndef();
} else {
// Move index from VCC into M0
if (Offset == 0) {
@@ -2573,12 +2820,6 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
}
}
- // Update EXEC, save the original EXEC value to VCC.
- BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
- .addReg(CondReg, RegState::Kill);
-
- MRI.setSimpleHint(NewExec, CondReg);
-
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
MachineInstr *InsertPt =
BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
@@ -2606,7 +2847,8 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
unsigned InitResultReg,
unsigned PhiReg,
int Offset,
- bool UseGPRIdxMode) {
+ bool UseGPRIdxMode,
+ bool IsIndirectSrc) {
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
@@ -2645,7 +2887,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, IsIndirectSrc);
MachineBasicBlock::iterator First = RemainderBB->begin();
BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
@@ -2730,7 +2972,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
// Control flow needs to be inserted if indexing with a VGPR.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2780,17 +3022,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
- if (UseGPRIdxMode) {
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::SRC0_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
- auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
+ Offset, UseGPRIdxMode, true);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2798,6 +3031,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
.addReg(SrcReg, RegState::Undef, SubReg)
.addReg(SrcReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
.addReg(SrcReg, RegState::Undef, SubReg)
@@ -2829,7 +3063,7 @@ static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
- const SISubtarget &ST) {
+ const GCNSubtarget &ST) {
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineFunction *MF = MBB.getParent();
@@ -2898,22 +3132,10 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- if (UseGPRIdxMode) {
- MachineBasicBlock::iterator I(&MI);
-
- MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
- .addImm(0) // Reset inside loop.
- .addImm(VGPRIndexMode::DST_ENABLE);
- SetOn->getOperand(3).setIsUndef();
-
- // Disable again after the loop.
- BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- }
-
unsigned PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
- Offset, UseGPRIdxMode);
+ Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
if (UseGPRIdxMode) {
@@ -2923,6 +3145,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
.addReg(Dst, RegState::ImplicitDefine)
.addReg(PhiReg, RegState::Implicit)
.addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
} else {
const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
@@ -2946,24 +3169,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isMIMG(MI)) {
- if (!MI.memoperands_empty())
- return BB;
+ if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ report_fatal_error("missing mem operand from MIMG instruction");
+ }
// Add a memoperand for mimg instructions so that they aren't assumed to
// be ordered memory instructions.
- MachinePointerInfo PtrInfo(MFI->getImagePSV());
- MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
- if (MI.mayStore())
- Flags |= MachineMemOperand::MOStore;
-
- if (MI.mayLoad())
- Flags |= MachineMemOperand::MOLoad;
-
- if (Flags != MachineMemOperand::MODereferenceable) {
- auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
- MI.addMemOperand(*MF, MMO);
- }
-
return BB;
}
@@ -3145,8 +3356,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::ADJCALLSTACKDOWN: {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
MachineInstrBuilder MIB(*MF, &MI);
+
+ // Add an implicit use of the frame offset reg to prevent the restore copy
+ // inserted after the call from being reordered after stack operations in
+ // the caller's frame.
MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
- .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
+ .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
+ .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
return BB;
}
case AMDGPU::SI_CALL_ISEL:
@@ -3236,12 +3452,17 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
+ case MVT::f32: {
// This is as fast on some subtargets. However, we always have full rate f32
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
+ if (Subtarget->hasFP32Denormals())
+ return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
+
+ // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
+ return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
+ }
case MVT::f64:
return true;
case MVT::f16:
@@ -3257,6 +3478,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4f16);
+
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+ SDLoc SL(Op);
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
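// --------------------------------------------------------------------------
// Conceptual sketch only (plain C++, not the LLVM implementation, and not
// part of this commit): what splitBinaryVectorOp above does -- run the
// operation on the two packed halves of a 4-element vector and concatenate
// the results instead of letting LegalizeDAG fully scalarize.
#include <array>
namespace split_sketch {
using V2 = std::array<float, 2>; // stand-in for a packed v2f16 value
using V4 = std::array<float, 4>; // stand-in for v4f16

template <typename BinOp>
V4 splitBinaryOp(const V4 &A, const V4 &B, BinOp Op) {
  V2 Lo = Op(V2{A[0], A[1]}, V2{B[0], B[1]}); // OpLo on the low half
  V2 Hi = Op(V2{A[2], A[3]}, V2{B[2], B[3]}); // OpHi on the high half
  return {Lo[0], Lo[1], Hi[0], Hi[1]};        // CONCAT_VECTORS
}
} // namespace split_sketch
// --------------------------------------------------------------------------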
@@ -3289,15 +3553,105 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:
- case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);
+ case ISD::DEBUGTRAP:
+ return lowerDEBUGTRAP(Op, DAG);
+ case ISD::FABS:
+ case ISD::FNEG:
+ return splitUnaryVectorOp(Op, DAG);
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FADD:
+ case ISD::FMUL:
+ return splitBinaryVectorOp(Op, DAG);
}
return SDValue();
}
+static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
+ const SDLoc &DL,
+ SelectionDAG &DAG, bool Unpacked) {
+ if (!LoadVT.isVector())
+ return Result;
+
+ if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
+ // Truncate to v2i16/v4i16.
+ EVT IntLoadVT = LoadVT.changeTypeToInteger();
+
+ // Work around the legalizer not scalarizing truncate after vector op
+ // legalization by not creating an intermediate vector trunc.
+ SmallVector<SDValue, 4> Elts;
+ DAG.ExtractVectorElements(Result, Elts);
+ for (SDValue &Elt : Elts)
+ Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
+
+ Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
+
+ // Bitcast to original type (v2f16/v4f16).
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+ }
+
+ // Cast back to the original packed type.
+ return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
+}
+
+SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
+ MemSDNode *M,
+ SelectionDAG &DAG,
+ bool IsIntrinsic) const {
+ SDLoc DL(M);
+ SmallVector<SDValue, 10> Ops;
+ Ops.reserve(M->getNumOperands());
+
+ Ops.push_back(M->getOperand(0));
+ if (IsIntrinsic)
+ Ops.push_back(DAG.getConstant(Opcode, DL, MVT::i32));
+
+ // Skip 1, as it is the intrinsic ID.
+ for (unsigned I = 2, E = M->getNumOperands(); I != E; ++I)
+ Ops.push_back(M->getOperand(I));
+
+ bool Unpacked = Subtarget->hasUnpackedD16VMem();
+ EVT LoadVT = M->getValueType(0);
+
+ EVT EquivLoadVT = LoadVT;
+ if (Unpacked && LoadVT.isVector()) {
+ EquivLoadVT = LoadVT.isVector() ?
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ LoadVT.getVectorNumElements()) : LoadVT;
+ }
+
+ // Change from v4f16/v2f16 to EquivLoadVT.
+ SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
+
+ SDValue Load
+ = DAG.getMemIntrinsicNode(
+ IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+ VTList, Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ if (!Unpacked) // Just adjusted the opcode.
+ return Load;
+
+ SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
+
+ return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
+}
+
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
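// --------------------------------------------------------------------------
// Illustrative sketch only (plain C++, not the LLVM API, and not part of this
// commit): on "unpacked D16" subtargets the memory node above yields one
// 32-bit lane per f16 element; adjustLoadValueTypeImpl truncates each lane to
// 16 bits and bitcasts the result back to the packed v2f16/v4f16 type.
#include <cstdint>
#include <vector>
namespace d16_sketch {
inline std::vector<uint16_t>
repackUnpackedD16(const std::vector<uint32_t> &Lanes) {
  std::vector<uint16_t> Packed;
  Packed.reserve(Lanes.size());
  for (uint32_t L : Lanes)
    Packed.push_back(uint16_t(L)); // TRUNCATE; low 16 bits hold the f16 pattern
  return Packed;                   // BUILD_VECTOR + BITCAST back to v*f16
}
} // namespace d16_sketch
// --------------------------------------------------------------------------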
@@ -3314,7 +3668,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
+ switch (IID) {
+ case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
SDLoc SL(N);
@@ -3323,6 +3678,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
return;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ SDLoc SL(N);
+ unsigned Opcode;
+
+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
+ return;
+ }
+ }
+ break;
+ }
+ case ISD::INTRINSIC_W_CHAIN: {
+ if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
break;
}
case ISD::SELECT: {
@@ -3347,12 +3734,38 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
return;
}
+ case ISD::FNEG: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x80008000, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
+ case ISD::FABS: {
+ if (N->getValueType(0) != MVT::v2f16)
+ break;
+
+ SDLoc SL(N);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
+
+ SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
+ BC,
+ DAG.getConstant(0x7fff7fff, SL, MVT::i32));
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
+ return;
+ }
default:
break;
}
}
-/// \brief Helper function for LowerBRCOND
+/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
SDNode *Parent = Value.getNode();
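// --------------------------------------------------------------------------
// Minimal sketch (plain C++, not the LLVM API, and not part of this commit)
// of the bit trick used in the v2f16 FNEG/FABS cases above: a packed pair of
// f16 values is negated or absolute-valued by masking both sign bits of the
// 32-bit pattern at once.
#include <cstdint>
namespace v2f16_sketch {
inline uint32_t fnegV2F16(uint32_t Packed) { return Packed ^ 0x80008000u; } // flip both sign bits
inline uint32_t fabsV2F16(uint32_t Packed) { return Packed & 0x7fff7fffu; } // clear both sign bits
} // namespace v2f16_sketch
// --------------------------------------------------------------------------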
@@ -3417,13 +3830,15 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+ return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -3560,40 +3975,37 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
SDValue Chain = Op.getOperand(0);
- unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
- SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
-
- if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
- Subtarget->isTrapHandlerEnabled()) {
- SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
-
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
-
- SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
-
- SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
- QueuePtr, SDValue());
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled())
+ return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- SDValue Ops[] = {
- ToReg,
- DAG.getTargetConstant(TrapID, SL, MVT::i16),
- SGPR01,
- ToReg.getValue(1)
- };
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
+ SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
+ QueuePtr, SDValue());
+ SDValue Ops[] = {
+ ToReg,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
+ SGPR01,
+ ToReg.getValue(1)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
+}
- return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
- }
+SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ SDValue Chain = Op.getOperand(0);
+ MachineFunction &MF = DAG.getMachineFunction();
- switch (TrapID) {
- case SISubtarget::TrapIDLLVMTrap:
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
- case SISubtarget::TrapIDLLVMDebugTrap: {
+ if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !Subtarget->isTrapHandlerEnabled()) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
@@ -3602,11 +4014,12 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Ctx.diagnose(NoTrap);
return Chain;
}
- default:
- llvm_unreachable("unsupported trap handler type!");
- }
- return Chain;
+ SDValue Ops[] = {
+ Chain,
+ DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
+ };
+ return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
@@ -3719,34 +4132,78 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
+ SDValue Vec = Op.getOperand(0);
+ SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+
+ assert(VecSize <= 64);
+
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc SL(Op);
+ auto KIdx = dyn_cast<ConstantSDNode>(Idx);
+
+ if (NumElts == 4 && EltSize == 16 && KIdx) {
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
+
+ SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(0, SL, MVT::i32));
+ SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
+ DAG.getConstant(1, SL, MVT::i32));
+
+ SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
+ SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
+
+ unsigned Idx = KIdx->getZExtValue();
+ bool InsertLo = Idx < 2;
+ SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
+ InsertLo ? LoVec : HiVec,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
+ DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
+
+ InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
+
+ SDValue Concat = InsertLo ?
+ DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
+ DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
+ }
+
if (isa<ConstantSDNode>(Idx))
return SDValue();
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+
// Avoid stack access for dynamic indexing.
- SDLoc SL(Op);
- SDValue Vec = Op.getOperand(0);
- SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
+ SDValue Val = InsVal;
+ if (InsVal.getValueType() == MVT::f16)
+ Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
- SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
+ SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
- DAG.getConstant(16, SL, MVT::i32));
+ assert(isPowerOf2_32(EltSize));
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
- SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ // Convert vector index to bit-index.
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
- DAG.getConstant(0xffff, SL, MVT::i32),
+ SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
+ DAG.getConstant(0xffff, SL, IntVT),
ScaledIdx);
- SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
- SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
- DAG.getNOT(SL, BFM, MVT::i32), BCVec);
+ SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+ SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
+ DAG.getNOT(SL, BFM, IntVT), BCVec);
- SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
- return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
+ SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
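// --------------------------------------------------------------------------
// Scalar sketch only (plain C++, not the LLVM API, and not part of this
// commit) of the bitfield-insert idea behind the dynamic INSERT_VECTOR_ELT
// lowering above: build a 16-bit mask at the element's bit position and blend
// the new value into the packed vector, avoiding any stack access.
#include <cstdint>
namespace bfi_sketch {
inline uint32_t insertHalf(uint32_t Vec, uint16_t Val, uint32_t Idx) {
  uint32_t Shift = Idx * 16;             // vector index -> bit index
  uint32_t Mask = 0xffffu << Shift;      // cf. v_bfm_b32
  uint32_t Ins = uint32_t(Val) << Shift; // value moved into position
  return (Mask & Ins) | (~Mask & Vec);   // cf. v_bfi_b32 blend
}
} // namespace bfi_sketch
// --------------------------------------------------------------------------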
@@ -3756,51 +4213,87 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);
+ EVT VecVT = Vec.getValueType();
+ unsigned VecSize = VecVT.getSizeInBits();
+ EVT EltVT = VecVT.getVectorElementType();
+ assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
- // Make sure we we do any optimizations that will make it easier to fold
+ // Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.
// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
- SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
+ unsigned EltSize = EltVT.getSizeInBits();
+ assert(isPowerOf2_32(EltSize));
- if (CIdx->getZExtValue() == 1) {
- Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
- DAG.getConstant(16, SL, MVT::i32));
- } else {
- assert(CIdx->getZExtValue() == 0);
- }
+ MVT IntVT = MVT::getIntegerVT(VecSize);
+ SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+ // Convert vector index to bit-index (* EltSize)
+ SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+ SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
+
+ if (ResultVT == MVT::f16) {
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
- SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
+ return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
+}
- // Convert vector index to bit-index.
- SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
+SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+ // Turn into pair of packed build_vectors.
+ // TODO: Special case for constants that can be materialized with s_mov_b64.
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(0), Op.getOperand(1) });
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+ { Op.getOperand(2), Op.getOperand(3) });
+
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+
+ SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
- SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
+ assert(VT == MVT::v2f16 || VT == MVT::v2i16);
- SDValue Result = Elt;
- if (ResultVT.bitsLT(MVT::i32))
- Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
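+  // Pack the two 16-bit elements into a single 32-bit word: element 0 goes in
+  // bits [15:0] and element 1 in bits [31:16].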
+ SDValue Lo = Op.getOperand(0);
+ SDValue Hi = Op.getOperand(1);
- return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
+ Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
+ Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
+
+ SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
+ DAG.getConstant(16, SL, MVT::i32));
+
+ SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Or);
}
bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
- GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+ GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());
}
@@ -3853,6 +4346,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
// FIXME: It isn't correct to rely on the type of the pointer. This should
// be removed when address space 0 is 64-bit.
@@ -3905,7 +4399,7 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
unsigned Offset) const {
SDLoc SL(Op);
SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, false);
+ DAG.getEntryNode(), Offset, 4, false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -3929,6 +4423,245 @@ static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getUNDEF(VT);
}
+static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
+ ArrayRef<SDValue> Elts) {
+ assert(!Elts.empty());
+ MVT Type;
+ unsigned NumElts;
+
+ if (Elts.size() == 1) {
+ Type = MVT::f32;
+ NumElts = 1;
+ } else if (Elts.size() == 2) {
+ Type = MVT::v2f32;
+ NumElts = 2;
+ } else if (Elts.size() <= 4) {
+ Type = MVT::v4f32;
+ NumElts = 4;
+ } else if (Elts.size() <= 8) {
+ Type = MVT::v8f32;
+ NumElts = 8;
+ } else {
+ assert(Elts.size() <= 16);
+ Type = MVT::v16f32;
+ NumElts = 16;
+ }
+
+ SmallVector<SDValue, 16> VecElts(NumElts);
+ for (unsigned i = 0; i < Elts.size(); ++i) {
+ SDValue Elt = Elts[i];
+ if (Elt.getValueType() != MVT::f32)
+ Elt = DAG.getBitcast(MVT::f32, Elt);
+ VecElts[i] = Elt;
+ }
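+  // Pad the remaining lanes with undef so the vector has the chosen width.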
+ for (unsigned i = Elts.size(); i < NumElts; ++i)
+ VecElts[i] = DAG.getUNDEF(MVT::f32);
+
+ if (NumElts == 1)
+ return VecElts[0];
+ return DAG.getBuildVector(Type, DL, VecElts);
+}
+
+static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
+ SDValue *GLC, SDValue *SLC) {
+ auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode());
+ if (!CachePolicyConst)
+ return false;
+
+ uint64_t Value = CachePolicyConst->getZExtValue();
+ SDLoc DL(CachePolicy);
+ if (GLC) {
+ *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
+ Value &= ~(uint64_t)0x2;
+ }
+
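+  // Fail if any cache-policy bits we do not understand remain set.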
+ return Value == 0;
+}
+
+SDValue SITargetLowering::lowerImage(SDValue Op,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+
+ SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
+ bool IsD16 = false;
+ SDValue VData;
+ int NumVDataDwords;
+ unsigned AddrIdx; // Index of first address argument
+ unsigned DMask;
+
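+  // Work out vdata, dmask and the index of the first address operand; the
+  // operand layout differs between atomics, stores and loads.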
+ if (BaseOpcode->Atomic) {
+ VData = Op.getOperand(2);
+
+ bool Is64Bit = VData.getValueType() == MVT::i64;
+ if (BaseOpcode->AtomicX2) {
+ SDValue VData2 = Op.getOperand(3);
+ VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
+ {VData, VData2});
+ if (Is64Bit)
+ VData = DAG.getBitcast(MVT::v4i32, VData);
+
+ ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ AddrIdx = 4;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ AddrIdx = 3;
+ }
+ } else {
+ unsigned DMaskIdx;
+
+ if (BaseOpcode->Store) {
+ VData = Op.getOperand(2);
+
+ MVT StoreVT = VData.getSimpleValueType();
+ if (StoreVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ VData = handleD16VData(VData, DAG);
+ }
+
+ NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
+ DMaskIdx = 3;
+ } else {
+ MVT LoadVT = Op.getSimpleValueType();
+ if (LoadVT.getScalarType() == MVT::f16) {
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
+ !BaseOpcode->HasD16)
+ return Op; // D16 is unsupported for this instruction
+
+ IsD16 = true;
+ if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
+ ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
+ }
+
+ NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
+ DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
+ }
+
+ auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
+ if (!DMaskConst)
+ return Op;
+
+ AddrIdx = DMaskIdx + 1;
+ DMask = DMaskConst->getZExtValue();
+ if (!DMask && !BaseOpcode->Store) {
+ // Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
+ // store the channels' default values.
+ SDValue Undef = DAG.getUNDEF(Op.getValueType());
+ if (isa<MemSDNode>(Op))
+ return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
+ return Undef;
+ }
+ }
+
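+  // Gather the address operands (any extra args, gradients, coordinates and
+  // lod/clamp/mip) and merge them into a single vector of dwords.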
+ unsigned NumVAddrs = BaseOpcode->NumExtraArgs +
+ (BaseOpcode->Gradients ? DimInfo->NumGradients : 0) +
+ (BaseOpcode->Coordinates ? DimInfo->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ SmallVector<SDValue, 4> VAddrs;
+ for (unsigned i = 0; i < NumVAddrs; ++i)
+ VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+
+ SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
+ SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ SDValue Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = True;
+ CtrlIdx = AddrIdx + NumVAddrs + 1;
+ } else {
+ auto UnormConst =
+ dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
+ if (!UnormConst)
+ return Op;
+
+ Unorm = UnormConst->getZExtValue() ? True : False;
+ CtrlIdx = AddrIdx + NumVAddrs + 3;
+ }
+
+ SDValue TexFail = Op.getOperand(CtrlIdx);
+ auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
+ if (!TexFailConst || TexFailConst->getZExtValue() != 0)
+ return Op;
+
+ SDValue GLC;
+ SDValue SLC;
+ if (BaseOpcode->Atomic) {
+ GLC = True; // TODO no-return optimization
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC))
+ return Op;
+ } else {
+ if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC))
+ return Op;
+ }
+
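+  // Assemble the MIMG machine-node operand list in instruction operand order.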
+ SmallVector<SDValue, 14> Ops;
+ if (BaseOpcode->Store || BaseOpcode->Atomic)
+ Ops.push_back(VData); // vdata
+ Ops.push_back(VAddr);
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
+ if (BaseOpcode->Sampler)
+ Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
+ Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
+ Ops.push_back(Unorm);
+ Ops.push_back(GLC);
+ Ops.push_back(SLC);
+ Ops.push_back(False); // r128
+ Ops.push_back(False); // tfe
+ Ops.push_back(False); // lwe
+ Ops.push_back(DimInfo->DA ? True : False);
+ if (BaseOpcode->HasD16)
+ Ops.push_back(IsD16 ? True : False);
+ if (isa<MemSDNode>(Op))
+ Ops.push_back(Op.getOperand(0)); // chain
+
+ int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32;
+ int Opcode = -1;
+
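+  // Prefer the gfx8 MIMG encoding on VI and newer; fall back to the gfx6
+  // encoding if no gfx8 variant exists.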
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(Intr->BaseOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ assert(Opcode != -1);
+
+ MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
+ if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ *MemRefs = MemOp->getMemOperand();
+ NewNode->setMemRefs(MemRefs, MemRefs + 1);
+ }
+
+ if (BaseOpcode->AtomicX2) {
+ SmallVector<SDValue, 1> Elt;
+ DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
+ return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
+ } else if (IsD16 && !BaseOpcode->Store) {
+ MVT LoadVT = Op.getSimpleValueType();
+ SDValue Adjusted = adjustLoadValueTypeImpl(
+ SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
+ return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
+ }
+
+ return SDValue(NewNode, 0);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -3942,14 +4675,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_implicit_buffer_ptr: {
- if (getSubtarget()->isAmdCodeObjectV2(MF))
+ if (getSubtarget()->isAmdCodeObjectV2(MF.getFunction()))
return emitNonHSAIntrinsicError(DAG, DL, VT);
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
}
case Intrinsic::amdgcn_dispatch_ptr:
case Intrinsic::amdgcn_queue_ptr: {
- if (!Subtarget->isAmdCodeObjectV2(MF)) {
+ if (!Subtarget->isAmdCodeObjectV2(MF.getFunction())) {
DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "unsupported hsa intrinsic without hsa target",
DL.getDebugLoc());
@@ -3979,16 +4712,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rcp_legacy:
- if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_rsq_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
Type *Type = VT.getTypeForEVT(*DAG.getContext());
@@ -4006,37 +4739,37 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, false);
+ SI::KernelInputOffsets::NGROUPS_X, 4, false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, false);
+ SI::KernelInputOffsets::NGROUPS_Y, 4, false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, false);
+ SI::KernelInputOffsets::NGROUPS_Z, 4, false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -4125,7 +4858,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
case Intrinsic::amdgcn_log_clamp: {
- if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();
DiagnosticInfoUnsupported BadIntrin(
@@ -4210,6 +4943,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_fdot2:
+ return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_fmul_legacy:
return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
Op.getOperand(1), Op.getOperand(2));
@@ -4221,10 +4957,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ubfe:
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_cvt_pkrtz: {
- // FIXME: Stop adding cast if v2f16 legal.
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
EVT VT = Op.getValueType();
- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+ unsigned Opcode;
+
+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
+ else
+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
+
+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
@@ -4238,17 +4991,14 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);
}
- case Intrinsic::amdgcn_image_getlod:
- case Intrinsic::amdgcn_image_getresinfo: {
- unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;
-
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
- if (!DMask || DMask->isNullValue())
- return DAG.getUNDEF(Op.getValueType());
- return SDValue();
- }
+ case Intrinsic::amdgcn_fmad_ftz:
+ return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
+
return Op;
}
}
@@ -4257,14 +5007,34 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);
- MachineFunction &MF = DAG.getMachineFunction();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec: {
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
- unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
- AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ unsigned Opc;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ Opc = AMDGPUISD::ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_atomic_dec:
+ Opc = AMDGPUISD::ATOMIC_DEC;
+ break;
+ case Intrinsic::amdgcn_ds_fadd:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+ break;
+ case Intrinsic::amdgcn_ds_fmin:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+ break;
+ case Intrinsic::amdgcn_ds_fmax:
+ Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+ break;
+ default:
+ llvm_unreachable("Unknown intrinsic!");
+ }
SDValue Ops[] = {
M->getOperand(0), // Chain
M->getOperand(2), // Ptr
@@ -4284,21 +5054,28 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // glc
Op.getOperand(6) // slc
};
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
+ auto *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
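+    // f16 and packed-f16 results go through the dedicated D16 load node.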
+ if (IsD16)
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(MFI->getBufferPSV()),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_tbuffer_load: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ EVT LoadVT = Op.getValueType();
+ bool IsD16 = LoadVT.getScalarType() == MVT::f16;
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG);
+ }
+
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // rsrc
@@ -4312,14 +5089,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(10) // slc
};
- EVT VT = Op.getOperand(2).getValueType();
-
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad,
- VT.getStoreSize(), VT.getStoreSize());
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, LoadVT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
@@ -4339,14 +5111,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // offset
Op.getOperand(6) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+
+ auto *M = cast<MemSDNode>(Op);
unsigned Opcode = 0;
switch (IntrID) {
@@ -4384,7 +5151,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
llvm_unreachable("unhandled atomic opcode");
}
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
@@ -4397,78 +5165,46 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // offset
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(4).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOVolatile,
- VT.getStoreSize(), 4);
+ EVT VT = Op.getValueType();
+ auto *M = cast<MemSDNode>(Op);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, MMO);
+ Op->getVTList(), Ops, VT, M->getMemOperand());
}
- // Basic sample.
- case Intrinsic::amdgcn_image_sample:
- case Intrinsic::amdgcn_image_sample_cl:
- case Intrinsic::amdgcn_image_sample_d:
- case Intrinsic::amdgcn_image_sample_d_cl:
- case Intrinsic::amdgcn_image_sample_l:
- case Intrinsic::amdgcn_image_sample_b:
- case Intrinsic::amdgcn_image_sample_b_cl:
- case Intrinsic::amdgcn_image_sample_lz:
- case Intrinsic::amdgcn_image_sample_cd:
- case Intrinsic::amdgcn_image_sample_cd_cl:
-
- // Sample with comparison.
- case Intrinsic::amdgcn_image_sample_c:
- case Intrinsic::amdgcn_image_sample_c_cl:
- case Intrinsic::amdgcn_image_sample_c_d:
- case Intrinsic::amdgcn_image_sample_c_d_cl:
- case Intrinsic::amdgcn_image_sample_c_l:
- case Intrinsic::amdgcn_image_sample_c_b:
- case Intrinsic::amdgcn_image_sample_c_b_cl:
- case Intrinsic::amdgcn_image_sample_c_lz:
- case Intrinsic::amdgcn_image_sample_c_cd:
- case Intrinsic::amdgcn_image_sample_c_cd_cl:
-
- // Sample with offsets.
- case Intrinsic::amdgcn_image_sample_o:
- case Intrinsic::amdgcn_image_sample_cl_o:
- case Intrinsic::amdgcn_image_sample_d_o:
- case Intrinsic::amdgcn_image_sample_d_cl_o:
- case Intrinsic::amdgcn_image_sample_l_o:
- case Intrinsic::amdgcn_image_sample_b_o:
- case Intrinsic::amdgcn_image_sample_b_cl_o:
- case Intrinsic::amdgcn_image_sample_lz_o:
- case Intrinsic::amdgcn_image_sample_cd_o:
- case Intrinsic::amdgcn_image_sample_cd_cl_o:
-
- // Sample with comparison and offsets.
- case Intrinsic::amdgcn_image_sample_c_o:
- case Intrinsic::amdgcn_image_sample_c_cl_o:
- case Intrinsic::amdgcn_image_sample_c_d_o:
- case Intrinsic::amdgcn_image_sample_c_d_cl_o:
- case Intrinsic::amdgcn_image_sample_c_l_o:
- case Intrinsic::amdgcn_image_sample_c_b_o:
- case Intrinsic::amdgcn_image_sample_c_b_cl_o:
- case Intrinsic::amdgcn_image_sample_c_lz_o:
- case Intrinsic::amdgcn_image_sample_c_cd_o:
- case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
- // Replace dmask with everything disabled with undef.
- const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
- if (!DMask || DMask->isNullValue()) {
- SDValue Undef = DAG.getUNDEF(Op.getValueType());
- return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
- }
+ default:
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return lowerImage(Op, ImageDimIntr, DAG);
return SDValue();
}
- default:
- return SDValue();
+}
+
+SDValue SITargetLowering::handleD16VData(SDValue VData,
+ SelectionDAG &DAG) const {
+ EVT StoreVT = VData.getValueType();
+
+ // No change for f16 and legal vector D16 types.
+ if (!StoreVT.isVector())
+ return VData;
+
+ SDLoc DL(VData);
+ assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
+
+ if (Subtarget->hasUnpackedD16VMem()) {
+ // We need to unpack the packed data to store.
+ EVT IntStoreVT = StoreVT.changeTypeToInteger();
+ SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
+
+ EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ StoreVT.getVectorNumElements());
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
+ return DAG.UnrollVectorOp(ZExt.getNode());
}
+
+ assert(isTypeLegal(StoreVT));
+ return VData;
}
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -4558,7 +5294,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
@@ -4613,9 +5349,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_tbuffer_store: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // voffset
@@ -4626,42 +5366,133 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(10), // glc
Op.getOperand(11) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore,
- VT.getStoreSize(), 4);
- return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
- Op->getVTList(), Ops, VT, MMO);
+ unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
+ AMDGPUISD::TBUFFER_STORE_FORMAT;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_buffer_store:
case Intrinsic::amdgcn_buffer_store_format: {
+ SDValue VData = Op.getOperand(2);
+ bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ if (IsD16)
+ VData = handleD16VData(VData, DAG);
SDValue Ops[] = {
Chain,
- Op.getOperand(2), // vdata
+ VData, // vdata
Op.getOperand(3), // rsrc
Op.getOperand(4), // vindex
Op.getOperand(5), // offset
Op.getOperand(6), // glc
Op.getOperand(7) // slc
};
- EVT VT = Op.getOperand(3).getValueType();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo(),
- MachineMemOperand::MOStore |
- MachineMemOperand::MODereferenceable,
- VT.getStoreSize(), 4);
+ unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
+ AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
+ MemSDNode *M = cast<MemSDNode>(Op);
+ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
+ return lowerImage(Op, ImageDimIntr, DAG);
- unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE :
- AMDGPUISD::BUFFER_STORE_FORMAT;
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
+ return Op;
}
+ }
+}
- default:
+static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
+ ISD::LoadExtType ExtType, SDValue Op,
+ const SDLoc &SL, EVT VT) {
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ switch (ExtType) {
+ case ISD::SEXTLOAD:
+ return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
+ case ISD::ZEXTLOAD:
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
+ case ISD::EXTLOAD:
+ return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
+ case ISD::NON_EXTLOAD:
return Op;
}
+
+ llvm_unreachable("invalid ext type");
+}
+
+SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ if (Ld->getAlignment() < 4 || Ld->isDivergent())
+ return SDValue();
+
+ // FIXME: Constant loads should all be marked invariant.
+ unsigned AS = Ld->getAddressSpace();
+ if (AS != AMDGPUASI.CONSTANT_ADDRESS &&
+ AS != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+ (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
+ return SDValue();
+
+ // Don't do this early, since it may interfere with adjacent load merging for
+ // illegal types. We can avoid losing alignment information for exotic types
+ // pre-legalize.
+ EVT MemVT = Ld->getMemoryVT();
+ if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
+ MemVT.getSizeInBits() >= 32)
+ return SDValue();
+
+ SDLoc SL(Ld);
+
+ assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
+ "unexpected vector extload");
+
+ // TODO: Drop only high part of range.
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
+ MVT::i32, SL, Ld->getChain(), Ptr,
+ Ld->getOffset(),
+ Ld->getPointerInfo(), MVT::i32,
+ Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags(),
+ Ld->getAAInfo(),
+ nullptr); // Drop ranges
+
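+  // The widened load produces an i32; re-narrow it to the original memory
+  // width (zero- or sign-extend in register) before extending to the result
+  // type.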
+ EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+ if (MemVT.isFloatingPoint()) {
+ assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
+ "unexpected fp extload");
+ TruncVT = MemVT.changeTypeToInteger();
+ }
+
+ SDValue Cvt = NewLoad;
+ if (Ld->getExtensionType() == ISD::SEXTLOAD) {
+ Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
+ DAG.getValueType(TruncVT));
+ } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
+ Ld->getExtensionType() == ISD::NON_EXTLOAD) {
+ Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
+ } else {
+ assert(Ld->getExtensionType() == ISD::EXTLOAD);
+ }
+
+ EVT VT = Ld->getValueType(0);
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // We may need to handle exotic cases, such as i16->i64 extloads, so insert
+ // the appropriate extension from the 32-bit load.
+ Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
+ DCI.AddToWorklist(Cvt.getNode());
+
+ // Handle conversion back to floating point if necessary.
+ Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
+
+ return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -4700,9 +5531,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
+ unsigned Alignment = Load->getAlignment();
unsigned AS = Load->getAddressSpace();
if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- AS, Load->getAlignment())) {
+ AS, Alignment)) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -4717,24 +5549,32 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
- if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
- if (isMemOpUniform(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+ if (!Op->isDivergent() && Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
- if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
+
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS) {
+ if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
+ !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Alignment >= 4)
return SDValue();
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requirements as global and private
// loads.
//
}
- if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+ if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+ AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+ AS == AMDGPUASI.GLOBAL_ADDRESS ||
AS == AMDGPUASI.FLAT_ADDRESS) {
if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
@@ -4761,21 +5601,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
- if (NumElements > 2)
- return SplitVectorLoad(Op, DAG);
-
- if (NumElements == 2)
+ // Use ds_read_b128 if possible.
+ if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
+ MemVT.getStoreSize() == 16)
return SDValue();
- // If properly aligned, if we split we might be able to use ds_read_b64.
- return SplitVectorLoad(Op, DAG);
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
}
return SDValue();
}
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() != MVT::i64)
- return SDValue();
+ EVT VT = Op.getValueType();
+ assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
@@ -4797,7 +5636,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
- return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
// Catch division cases where we can use shortcuts with rcp and rsq
@@ -4809,8 +5648,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
- Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
+ bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
return SDValue();
@@ -5067,7 +5905,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -5165,14 +6003,14 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("unsupported private_element_size");
}
} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
+ // Use ds_write_b128 if possible.
+ if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
+ VT.getStoreSize() == 16)
+ return SDValue();
+
if (NumElements > 2)
return SplitVectorStore(Op, DAG);
-
- if (NumElements == 2)
- return Op;
-
- // If properly aligned, if we split we might be able to use ds_write_b64.
- return SplitVectorStore(Op, DAG);
+ return SDValue();
} else {
llvm_unreachable("unhandled address space");
}
@@ -5246,7 +6084,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// easier if i8 vectors weren't promoted to i32 vectors, particularly after
// types are legalized. v4i8 -> v4f32 is probably the only case to worry
// about in practice.
- if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
+ if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
DCI.AddToWorklist(Cvt.getNode());
@@ -5389,6 +6227,71 @@ static bool isBoolSGPR(SDValue V) {
return false;
}
+// If a constant has all zeroes or all ones within each byte, return it.
+// Otherwise return 0.
+static uint32_t getConstantPermuteMask(uint32_t C) {
+ // 0xff for any zero byte in the mask
+ uint32_t ZeroByteMask = 0;
+ if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
+ if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
+ if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
+ if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
+ uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
+ if ((NonZeroByteMask & C) != NonZeroByteMask)
+ return 0; // Partial bytes selected.
+ return C;
+}
+
+// Check if a node selects whole bytes from its operand 0 starting at a byte
+// boundary while masking the rest. Returns the select mask as used by
+// v_perm_b32, or all-ones (~0) if the node does not match.
+// Note byte select encoding:
+// value 0-3 selects corresponding source byte;
+// value 0xc selects zero;
+// value 0xff selects 0xff.
+static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+ assert(V.getValueSizeInBits() == 32);
+
+ if (V.getNumOperands() != 2)
+ return ~0;
+
+ ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!N1)
+ return ~0;
+
+ uint32_t C = N1->getZExtValue();
+
+ switch (V.getOpcode()) {
+ default:
+ break;
+ case ISD::AND:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
+ }
+ break;
+
+ case ISD::OR:
+ if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ return (0x03020100 & ~ConstMask) | ConstMask;
+ }
+ break;
+
+ case ISD::SHL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t((0x030201000c0c0c0cull << C) >> 32);
+
+ case ISD::SRL:
+ if (C % 8)
+ return ~0;
+
+ return uint32_t(0x0c0c0c0c03020100ull >> C);
+ }
+
+ return ~0;
+}
+
SDValue SITargetLowering::performAndCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (DCI.isBeforeLegalize())
@@ -5435,6 +6338,20 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
}
}
}
+
+ // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(Mask);
+ if (!Sel)
+ return SDValue();
+
+ // Select 0xc for all zero bytes
+ Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
}
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -5487,6 +6404,54 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
}
+ // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each lane actually used from a source operand. Bytes
+      // that select zero already have the 0xc bits set, bytes that select
+      // 0xff are 0xff, and real source lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Each byte of each mask is either a lane selector (0-3), 0xff (select
+        // the constant 0xff) or 0x0c (select zero). If either mask has 0x0c in
+        // a byte, the combined byte must be 0x0c; otherwise the mask that is
+        // not 0xff wins. ANDing the two masks gives the right result, except
+        // that bytes meant to be 0x0c must be forced back to exactly 0x0c.
+ uint32_t Mask = LHSMask & RHSMask;
+ for (unsigned I = 0; I < 32; I += 8) {
+ uint32_t ByteSel = 0xff << I;
+ if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
+ Mask &= (0x0c << I) & 0xffffffff;
+ }
+
+ // Add 4 to each active LHS lane. It will not affect any existing 0xff
+ // or 0x0c.
+ uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
return SDValue();
}
@@ -5522,6 +6487,60 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
return SDValue();
}
+ // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
+ if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
+ LHS.getOpcode() == AMDGPUISD::PERM &&
+ isa<ConstantSDNode>(LHS.getOperand(2))) {
+ uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
+ if (!Sel)
+ return SDValue();
+
+ Sel |= LHS.getConstantOperandVal(2);
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
+ LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
+ }
+
+ // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
+ N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
+ uint32_t LHSMask = getPermuteMask(DAG, LHS);
+ uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ if (LHSMask != ~0u && RHSMask != ~0u) {
+ // Canonicalize the expression in an attempt to have fewer unique masks
+ // and therefore fewer registers used to hold the masks.
+ if (LHSMask > RHSMask) {
+ std::swap(LHSMask, RHSMask);
+ std::swap(LHS, RHS);
+ }
+
+      // Select 0xc for each lane actually used from a source operand. Bytes
+      // that select zero already have the 0xc bits set, bytes that select
+      // 0xff are 0xff, and real source lanes are in the 0-3 range.
+ uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+ uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
+
+      // Check if we need to combine values from two sources within a byte.
+ if (!(LHSUsedLanes & RHSUsedLanes) &&
+ // If we select high and lower word keep it for SDWA.
+ // TODO: teach SDWA to work with v_perm_b32 and remove the check.
+ !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
+        // Kill the zero bytes selected by the other mask; the zero selector is 0xc.
+ LHSMask &= ~RHSUsedLanes;
+ RHSMask &= ~LHSUsedLanes;
+ // Add 4 to each active LHS lane
+ LHSMask |= LHSUsedLanes & 0x04040404;
+ // Combine masks
+ uint32_t Sel = LHSMask | RHSMask;
+ SDLoc DL(N);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
+ LHS.getOperand(0), RHS.getOperand(0),
+ DAG.getConstant(Sel, DL, MVT::i32));
+ }
+ }
+ }
+
if (VT != MVT::i64)
return SDValue();
@@ -5628,6 +6647,7 @@ static bool fp16SrcZerosHighBits(unsigned Opc) {
case AMDGPUISD::FMAD_FTZ:
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::LDEXP:
return true;
default:
@@ -5680,6 +6700,23 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performRcpCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+
+ if (N0.isUndef())
+ return N0;
+
+ if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
+ N0.getOpcode() == ISD::SINT_TO_FP)) {
+ return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
+ N->getFlags());
+ }
+
+ return AMDGPUTargetLowering::performRcpCombine(N, DCI);
+}
+
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
return true;
@@ -5688,7 +6725,7 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
}
static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
- const SISubtarget *ST, unsigned MaxDepth=5) {
+ const GCNSubtarget *ST, unsigned MaxDepth=5) {
// If source is a result of another standard FP operation it is already in
// canonical form.
@@ -5946,7 +6983,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- VT != MVT::f64 &&
+ !VT.isVector() && VT != MVT::f64 &&
((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
@@ -6066,15 +7103,87 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
-
SelectionDAG &DAG = DCI.DAG;
- if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
+
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+
+ if ((Vec.getOpcode() == ISD::FNEG ||
+ Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
SDLoc SL(N);
EVT EltVT = N->getValueType(0);
SDValue Idx = N->getOperand(1);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
Vec.getOperand(0), Idx);
- return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
+ return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
+ }
+
+ // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
+ // =>
+ // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
+ // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
+ // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
+ if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
+ SDLoc SL(N);
+ EVT EltVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ unsigned Opc = Vec.getOpcode();
+
+ switch(Opc) {
+ default:
+ return SDValue();
+ // TODO: Support other binary operations.
+ case ISD::FADD:
+ case ISD::ADD:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ return DAG.getNode(Opc, SL, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(0), Idx),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Vec.getOperand(1), Idx));
+ }
+ }
+
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ unsigned VecSize = VecVT.getSizeInBits();
+ unsigned EltSize = EltVT.getSizeInBits();
+
+ // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
+ // elements. This exposes more load reduction opportunities by replacing
+ // multiple small extract_vector_elements with a single 32-bit extract.
+ auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (EltSize <= 16 &&
+ EltVT.isByteSized() &&
+ VecSize > 32 &&
+ VecSize % 32 == 0 &&
+ Idx) {
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
+
+ unsigned BitIndex = Idx->getZExtValue() * EltSize;
+ unsigned EltIdx = BitIndex / 32;
+ unsigned LeftoverBitIdx = BitIndex % 32;
+ SDLoc SL(N);
+
+ SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
+ DCI.AddToWorklist(Cast.getNode());
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
+ DAG.getConstant(EltIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Elt.getNode());
+ SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
+ DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
+ DCI.AddToWorklist(Srl.getNode());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
+ DCI.AddToWorklist(Trunc.getNode());
+ return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
}
return SDValue();
@@ -6135,8 +7244,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const TargetOptions &Options = DAG.getTarget().Options;
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
- (N0->getFlags().hasUnsafeAlgebra() &&
- N1->getFlags().hasUnsafeAlgebra())) &&
+ (N0->getFlags().hasAllowContract() &&
+ N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -6192,7 +7301,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
- if (VT != MVT::i32)
+ if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
// add x, zext (setcc) => addcarry x, 0, setcc
@@ -6368,6 +7477,79 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFMACombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ if (!Subtarget->hasDLInsts() || VT != MVT::f32)
+ return SDValue();
+
+  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
+ // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
+ SDValue Op1 = N->getOperand(0);
+ SDValue Op2 = N->getOperand(1);
+ SDValue FMA = N->getOperand(2);
+
+ if (FMA.getOpcode() != ISD::FMA ||
+ Op1.getOpcode() != ISD::FP_EXTEND ||
+ Op2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+  // fdot2_f32_f16 always flushes fp32 denormal operands and the output to
+  // zero, regardless of the denorm mode setting. Therefore, unsafe-fp-math or
+  // fp-contract is sufficient to allow generating fdot2.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N->getFlags().hasAllowContract() &&
+ FMA->getFlags().hasAllowContract())) {
+ Op1 = Op1.getOperand(0);
+ Op2 = Op2.getOperand(0);
+ if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec1 = Op1.getOperand(0);
+ SDValue Idx1 = Op1.getOperand(1);
+ SDValue Vec2 = Op2.getOperand(0);
+
+ SDValue FMAOp1 = FMA.getOperand(0);
+ SDValue FMAOp2 = FMA.getOperand(1);
+ SDValue FMAAcc = FMA.getOperand(2);
+
+ if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
+ FMAOp2.getOpcode() != ISD::FP_EXTEND)
+ return SDValue();
+
+ FMAOp1 = FMAOp1.getOperand(0);
+ FMAOp2 = FMAOp2.getOperand(0);
+ if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ SDValue Vec3 = FMAOp1.getOperand(0);
+ SDValue Vec4 = FMAOp2.getOperand(0);
+ SDValue Idx2 = FMAOp1.getOperand(1);
+
+ if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
+ // Idx1 and Idx2 cannot be the same.
+ Idx1 == Idx2)
+ return SDValue();
+
+ if (Vec1 == Vec2 || Vec3 == Vec4)
+ return SDValue();
+
+ if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
+ return SDValue();
+
+ if ((Vec1 == Vec3 && Vec2 == Vec4) ||
+ (Vec1 == Vec4 && Vec2 == Vec3))
+ return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc);
+ }
+ return SDValue();
+}
+
SDValue SITargetLowering::performSetCCCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6387,23 +7569,49 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
- if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
- isBoolSGPR(LHS.getOperand(0))) {
- // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
- // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
- // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
- // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
- return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
- DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
- (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
- return LHS.getOperand(0);
+ if (CRHS) {
+ if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
+ isBoolSGPR(LHS.getOperand(0))) {
+ // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
+ // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
+ // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CRHS->isAllOnesValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
+ (CRHS->isNullValue() &&
+ (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
+ return LHS.getOperand(0);
+ }
+
+ uint64_t CRHSVal = CRHS->getZExtValue();
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ LHS.getOpcode() == ISD::SELECT &&
+ isa<ConstantSDNode>(LHS.getOperand(1)) &&
+ isa<ConstantSDNode>(LHS.getOperand(2)) &&
+ LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
+ isBoolSGPR(LHS.getOperand(0))) {
+      // Given CT != CF:
+ // setcc (select cc, CT, CF), CF, eq => xor cc, -1
+ // setcc (select cc, CT, CF), CF, ne => cc
+ // setcc (select cc, CT, CF), CT, ne => xor cc, -1
+ // setcc (select cc, CT, CF), CT, eq => cc
+ uint64_t CT = LHS.getConstantOperandVal(1);
+ uint64_t CF = LHS.getConstantOperandVal(2);
+
+ if ((CF == CRHSVal && CC == ISD::SETEQ) ||
+ (CT == CRHSVal && CC == ISD::SETNE))
+ return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
+ DAG.getConstant(-1, SL, MVT::i1));
+ if ((CF == CRHSVal && CC == ISD::SETNE) ||
+ (CT == CRHSVal && CC == ISD::SETEQ))
+ return LHS.getOperand(0);
+ }
}
if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
@@ -6472,6 +7680,29 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performClampCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CSrc)
+ return SDValue();
+
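+  // Fold clamp of a constant: values below zero (or NaN when DX10 clamp is
+  // enabled) become 0.0, values above one become 1.0, and anything already in
+  // [0, 1] is returned unchanged.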
+ const APFloat &F = CSrc->getValueAPF();
+ APFloat Zero = APFloat::getZero(F.getSemantics());
+ APFloat::cmpResult Cmp0 = F.compare(Zero);
+ if (Cmp0 == APFloat::cmpLessThan ||
+ (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+ return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+ }
+
+ APFloat One(F.getSemantics(), "1.0");
+ APFloat::cmpResult Cmp1 = F.compare(One);
+ if (Cmp1 == APFloat::cmpGreaterThan)
+ return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+ return SDValue(CSrc, 0);
+}
+
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
@@ -6503,7 +7734,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performMinMaxCombine(N, DCI);
break;
}
- case ISD::LOAD:
+ case ISD::FMA:
+ return performFMACombine(N, DCI);
+ case ISD::LOAD: {
+    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+      return Widened;
+ LLVM_FALLTHROUGH;
+ }
case ISD::STORE:
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE:
@@ -6521,7 +7758,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_LOAD_UMAX:
case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+ case AMDGPUISD::ATOMIC_DEC:
+ case AMDGPUISD::ATOMIC_LOAD_FADD:
+ case AMDGPUISD::ATOMIC_LOAD_FMIN:
+ case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
@@ -6537,11 +7777,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performClassCombine(N, DCI);
case ISD::FCANONICALIZE:
return performFCanonicalizeCombine(N, DCI);
- case AMDGPUISD::FRACT:
case AMDGPUISD::RCP:
+ return performRcpCombine(N, DCI);
+ case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
SDValue Src = N->getOperand(0);
@@ -6561,6 +7803,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFMed3Combine(N, DCI);
case AMDGPUISD::CVT_PKRTZ_F16_F32:
return performCvtPkRTZCombine(N, DCI);
+ case AMDGPUISD::CLAMP:
+ return performClampCombine(N, DCI);
case ISD::SCALAR_TO_VECTOR: {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -6587,7 +7831,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
-/// \brief Helper function for adjustWritemask
+/// Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
switch (Idx) {
default: return 0;
@@ -6598,12 +7842,19 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// \brief Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
+ unsigned Opcode = Node->getMachineOpcode();
+
+ // Subtract 1 because the vdata output is not a MachineSDNode operand.
+ int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
+ if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
+ return Node; // not implemented for D16
+
SDNode *Users[4] = { nullptr };
unsigned Lane = 0;
- unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -6653,9 +7904,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned BitsSet = countPopulation(NewDmask);
- const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
- Node->getMachineOpcode(), BitsSet);
+ int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
assert(NewOpcode != -1 &&
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
"failed to find equivalent MIMG op");
@@ -6720,7 +7969,7 @@ static bool isFrameIndexOp(SDValue Op) {
return isa<FrameIndexSDNode>(Op);
}
-/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
+/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
@@ -6767,7 +8016,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
return DAG.UpdateNodeOperands(Node, Ops);
}
-/// \brief Fold the instructions after selecting them.
+/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
@@ -6841,7 +8090,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
-/// \brief Assign the register class depending on the number of
+/// Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
@@ -6928,7 +8177,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
-/// \brief Return a resource descriptor with the 'Add TID' bit enabled
+/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
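A small standalone sketch (not LLVM code) of the stride field described above: bits [61:48] of the 128-bit resource descriptor hold a 14-bit stride that the hardware multiplies by the thread ID when the 'Add TID' bit is set. The values below are illustrative.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t RsrcLo = 0;                          // low 64 bits of the descriptor (base address etc.)
  const uint16_t Stride = 16;                   // hypothetical per-thread stride in bytes

  RsrcLo |= (uint64_t)(Stride & 0x3fff) << 48;  // stride lives in bits [61:48]

  // Effective offset added to the resource pointer for a given thread ID.
  unsigned Tid = 5;
  uint64_t Offset = Tid * ((RsrcLo >> 48) & 0x3fff);
  std::printf("offset for tid %u = %llu\n", Tid, (unsigned long long)Offset); // 80
  return 0;
}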
@@ -6970,11 +8219,11 @@ std::pair<unsigned, const TargetRegisterClass *>
SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
MVT VT) const {
- if (!isTypeLegal(VT))
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
-
+ const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
switch (Constraint[0]) {
+ default:
+ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
switch (VT.getSizeInBits()) {
@@ -6982,40 +8231,56 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
+ RC = &AMDGPU::SReg_32_XM0RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
+ RC = &AMDGPU::SGPR_64RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
+ RC = &AMDGPU::SReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+ RC = &AMDGPU::SReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
+ RC = &AMDGPU::SReg_512RegClass;
+ break;
}
-
+ break;
case 'v':
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
case 32:
case 16:
- return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
+ RC = &AMDGPU::VGPR_32RegClass;
+ break;
case 64:
- return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
+ RC = &AMDGPU::VReg_64RegClass;
+ break;
case 96:
- return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
+ RC = &AMDGPU::VReg_96RegClass;
+ break;
case 128:
- return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
+ RC = &AMDGPU::VReg_128RegClass;
+ break;
case 256:
- return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
+ RC = &AMDGPU::VReg_256RegClass;
+ break;
case 512:
- return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
+ RC = &AMDGPU::VReg_512RegClass;
+ break;
}
+ break;
}
+ // We actually support i128, i16, and f16 as inline parameters,
+ // even if they are not reported as legal.
+ if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
+ VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
+ return std::make_pair(0U, RC);
}
if (Constraint.size() > 1) {
- const TargetRegisterClass *RC = nullptr;
if (Constraint[1] == 'v') {
RC = &AMDGPU::VGPR_32RegClass;
} else if (Constraint[1] == 's') {
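A hedged usage sketch (not from this patch) of how the 's' and 'v' constraints handled above typically surface in source-level inline asm when compiling for an amdgcn target; the function name and instruction choice are illustrative.

unsigned copy_through_vgpr(unsigned In) {
  unsigned Out;
  // "=v" requests a 32-bit VGPR for the result and "s" a 32-bit SGPR for the
  // input, matching the VGPR_32 / SReg_32_XM0 classes selected above.
  __asm__ volatile("v_mov_b32 %0, %1" : "=v"(Out) : "s"(In));
  return Out;
}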
@@ -7052,8 +8317,7 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
if (Info->isEntryFunction()) {
// Callable functions have fixed registers used for stack access.
@@ -7083,6 +8347,8 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
Info->getScratchWaveOffsetReg());
+ Info->limitOccupancy(MF);
+
TargetLoweringBase::finalizeLowering(MF);
}
@@ -7103,3 +8369,69 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
// calculation won't overflow, so assume the sign bit is never set.
Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
+
+bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const {
+ switch (N->getOpcode()) {
+ case ISD::Register:
+ case ISD::CopyFromReg: {
+ const RegisterSDNode *R = nullptr;
+ if (N->getOpcode() == ISD::Register)
+ R = dyn_cast<RegisterSDNode>(N);
+ else
+ R = dyn_cast<RegisterSDNode>(N->getOperand(1));
+ if (R) {
+ const MachineFunction *MF = FLI->MF;
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+ unsigned Reg = R->getReg();
+ if (TRI.isPhysicalRegister(Reg))
+ return TRI.isVGPR(MRI, Reg);
+
+ if (MRI.isLiveIn(Reg)) {
+ // workitem.id.x workitem.id.y workitem.id.z
+ // Any VGPR formal argument is also considered divergent
+ if (TRI.isVGPR(MRI, Reg))
+ return true;
+ // Formal arguments of non-entry functions
+ // are conservatively considered divergent.
+ if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
+ return true;
+ }
+ return !DA || DA->isDivergent(FLI->getValueFromVirtualReg(Reg));
+ }
+ }
+ break;
+ case ISD::LOAD: {
+ const LoadSDNode *L = cast<LoadSDNode>(N);
+ if (L->getMemOperand()->getAddrSpace() ==
+ Subtarget->getAMDGPUAS().PRIVATE_ADDRESS)
+ return true;
+ break;
+ }
+ case ISD::CALLSEQ_END:
+ return true;
+ case ISD::INTRINSIC_WO_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+ case ISD::INTRINSIC_W_CHAIN:
+ return AMDGPU::isIntrinsicSourceOfDivergence(
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+ // In some cases, intrinsics that are a source of divergence have already
+ // been lowered to AMDGPUISD nodes, so we need to check those as well.
+ case AMDGPUISD::INTERP_MOV:
+ case AMDGPUISD::INTERP_P1:
+ case AMDGPUISD::INTERP_P2:
+ return true;
+ }
+ return false;
+}
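A condensed standalone sketch (not LLVM code) of the decision order used above for ISD::Register / ISD::CopyFromReg nodes; every parameter here is a placeholder for the corresponding query in the real function.

bool isRegisterNodeDivergent(bool IsPhysical, bool IsVGPR, bool IsLiveIn,
                             bool IsEntryFunction, bool HasDA,
                             bool DASaysDivergent) {
  if (IsPhysical)
    return IsVGPR;                       // physical VGPRs are divergent, SGPRs are not
  if (IsLiveIn && (IsVGPR || !IsEntryFunction))
    return true;                         // workitem ids and non-entry formal arguments
  return !HasDA || DASaysDivergent;      // otherwise defer to DivergenceAnalysis,
                                         // conservatively divergent without one
}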