Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 1725
1 file changed, 1263 insertions(+), 462 deletions(-)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 544867513d9c..51241cf0a432 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18,33 +18,46 @@
#include <cmath>
#endif
-#include "SIISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
using namespace llvm;
-SITargetLowering::SITargetLowering(TargetMachine &TM,
- const AMDGPUSubtarget &STI)
+// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+ "amdgpu-fast-fdiv",
+ cl::desc("Enable faster 2.5 ulp fdiv"),
+ cl::init(false));
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+ unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+ for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+ if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+ return AMDGPU::SGPR0 + Reg;
+ }
+ }
+ llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+ const SISubtarget &STI)
: AMDGPUTargetLowering(TM, STI) {
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
- addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
@@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
computeRegisterProperties(STI.getRegisterInfo());
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
- setOperationAction(ISD::ADD, MVT::i32, Legal);
- setOperationAction(ISD::ADDC, MVT::i32, Legal);
- setOperationAction(ISD::ADDE, MVT::i32, Legal);
- setOperationAction(ISD::SUBC, MVT::i32, Legal);
- setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
- setOperationAction(ISD::FSIN, MVT::f32, Custom);
- setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
// We need to custom lower vector stores from local memory
+ setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-
setOperationAction(ISD::STORE, MVT::i1, Custom);
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
+
+ setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
+ setOperationAction(ISD::SETCC, MVT::i1, Promote);
setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
- setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
- for (MVT VT : MVT::integer_valuetypes()) {
- if (VT == MVT::i64)
- continue;
-
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
- }
-
- for (MVT VT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
-
- setTruncStoreAction(MVT::i64, MVT::i32, Expand);
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
-
- setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
-
- setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
- setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
-
- setOperationAction(ISD::LOAD, MVT::i1, Custom);
-
- setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::STORE, MVT::v2i64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
-
- setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
-
- setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
- setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
- setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
-
- // These should use UDIVREM, so set them to expand
- setOperationAction(ISD::UDIV, MVT::i64, Expand);
- setOperationAction(ISD::UREM, MVT::i64, Expand);
-
- setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
- setOperationAction(ISD::SELECT, MVT::i1, Promote);
-
- setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
-
-
- setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i1, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::f64, Expand);
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
- switch(Op) {
+ switch (Op) {
case ISD::LOAD:
case ISD::STORE:
case ISD::BUILD_VECTOR:
@@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
+
+ // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
+ // output demarshalling.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+ // We can't return success/failure, only the old value,
+ // so let LLVM add the comparison.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
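For clarity, here is a minimal scalar sketch of what the WITH_SUCCESS expansion amounts to, assuming an atomic that can only return the old value; the helper name and model are illustrative, not part of this patch:

```cpp
#include <atomic>
#include <utility>

std::pair<int, bool> cmpSwapWithSuccess(std::atomic<int> &Mem,
                                        int Expected, int Desired) {
  int Old = Expected;
  Mem.compare_exchange_strong(Old, Desired); // hardware yields only Old
  // LLVM reconstructs the success flag with an explicit comparison.
  return {Old, Old == Expected};
}
```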
+ if (getSubtarget()->hasFlatAddressSpace()) {
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
+ }
+
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+
+ // This is s_memtime on SI and s_memrealtime on VI.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
+ setOperationAction(ISD::TRAP, MVT::Other, Custom);
+
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+
+ if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
setOperationAction(ISD::FRINT, MVT::f64, Legal);
}
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+
+ setOperationAction(ISD::FSIN, MVT::f32, Custom);
+ setOperationAction(ISD::FCOS, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
@@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::FCANONICALIZE);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
setSchedulingPreference(Sched::RegPressure);
}
+const SISubtarget *SITargetLowering::getSubtarget() const {
+ return static_cast<const SISubtarget *>(Subtarget);
+}
+
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
+bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+ const CallInst &CI,
+ unsigned IntrID) const {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align = 0;
+ Info.vol = false;
+ Info.readMem = true;
+ Info.writeMem = true;
+ return true;
+ default:
+ return false;
+ }
+}
+
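For reference, a scalar sketch of the wrapping increment these intrinsics model, assuming the documented GCN atomic-inc behavior; the helper is illustrative only:

```cpp
#include <cstdint>

uint32_t atomicIncWrap(uint32_t &Mem, uint32_t Bound) {
  uint32_t Old = Mem;
  Mem = (Old >= Bound) ? 0 : Old + 1; // wraps to 0 once Bound is reached
  return Old;                         // the old value is returned
}
```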
bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
EVT) const {
// SI has some legal vector types, but no legal vector operations. Say no
@@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
// Assume we will use FLAT for all global memory accesses
// on VI.
// FIXME: This assumption is currently wrong. On VI we still use
@@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (DL.getTypeStoreSize(Ty) < 4)
return isLegalMUBUFAddressingMode(AM);
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// SMRD instructions have an 8-bit, dword offset on SI.
if (!isUInt<8>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
// On CI+, this can also be a 32-bit literal constant offset. If it fits
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
@@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
case AMDGPUAS::PRIVATE_ADDRESS:
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
return isLegalMUBUFAddressingMode(AM);
case AMDGPUAS::LOCAL_ADDRESS:
@@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return false;
}
case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+ // An unknown address space usually means the pointer is being used for
+ // pure arithmetic rather than an addressing computation. We don't have
+ // instructions that compute pointers with any addressing modes, so treat
+ // such pointers as having no offset, like flat instructions.
return isLegalFlatAddressingMode(AM);
default:
@@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple() || VT == MVT::Other)
return false;
- // TODO - CI+ supports unaligned memory accesses, but this requires driver
- // support.
-
- // XXX - The only mention I see of this in the ISA manual is for LDS direct
- // reads the "byte address and must be dword aligned". Is it also true for the
- // normal loads and stores?
- if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
+ if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
// aligned, 8 byte access in a single operation using ds_read2/write2_b32
// with adjacent offsets.
bool AlignedBy4 = (Align % 4 == 0);
if (IsFast)
*IsFast = AlignedBy4;
+
return AlignedBy4;
}
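A scalar sketch of the split the comment above describes, assuming a 4-byte aligned 8-byte LDS read done as two adjacent dword reads (the ds_read2_b32 offset0/offset1 pattern); illustrative only:

```cpp
#include <cstdint>

uint64_t readB64AlignedBy4(const uint32_t *LDS, unsigned DwordIdx) {
  uint32_t Lo = LDS[DwordIdx];     // ds_read2_b32 ... offset0 = DwordIdx
  uint32_t Hi = LDS[DwordIdx + 1]; //                  offset1 = DwordIdx + 1
  return ((uint64_t)Hi << 32) | Lo;
}
```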
+ if (Subtarget->hasUnalignedBufferAccess()) {
+ // If we have a uniform constant load, it still requires using a slow
+ // buffer instruction if unaligned.
+ if (IsFast) {
+ *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+ (Align % 4 == 0) : true;
+ }
+
+ return true;
+ }
+
// Smaller than dword value must be aligned.
- // FIXME: This should be allowed on CI+
if (VT.bitsLT(MVT::i32))
return false;
@@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) {
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
- return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+ return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
-
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
- // Sometimes LDS instructions have constant pointers
- if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
- isa<GlobalValue>(Ptr))
+ // Sometimes LDS instructions have constant pointers.
+ // If Ptr is null, then that means this mem operand contains a
+ // PseudoSourceValue like GOT.
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+ isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
return true;
- const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
+ const Instruction *I = dyn_cast<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
@@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const {
bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
return TII->isInlineConstant(Imm);
}
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
- SDLoc SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
+
+ // SimplifySetCC uses this function to determine whether or not it should
+ // create setcc with i1 operands. We don't have instructions for i1 setcc.
+ if (VT == MVT::i1 && Op == ISD::SETCC)
+ return false;
+
+ return TargetLowering::isTypeDesirableForOp(Op, VT);
+}
+
+SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset) const {
const DataLayout &DL = DAG.getDataLayout();
MachineFunction &MF = DAG.getMachineFunction();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
- Type *Ty = VT.getTypeForEVT(*DAG.getContext());
-
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
- DAG.getConstant(Offset, SL, PtrVT));
+ return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Offset, SL, PtrVT));
+}
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+ const SDLoc &SL, SDValue Chain,
+ unsigned Offset, bool Signed) const {
+ const DataLayout &DL = DAG.getDataLayout();
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
SDValue PtrOffset = DAG.getUNDEF(PtrVT);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
@@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
if (MemVT.isFloatingPoint())
ExtTy = ISD::EXTLOAD;
- return DAG.getLoad(ISD::UNINDEXED, ExtTy,
- VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
- false, // isVolatile
- true, // isNonTemporal
- true, // isInvariant
- Align); // Alignment
+ SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
+ return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset,
+ PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MOInvariant);
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
- if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
+ if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
const Function *Fn = MF.getFunction();
- DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA");
+ DiagnosticInfoUnsupported NoGraphicsHSA(
+ *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
DAG.getContext()->diagnose(NoGraphicsHSA);
- return SDValue();
+ return DAG.getEntryNode();
}
- // FIXME: We currently assume all calling conventions are kernels.
+ // Create stack objects that are used for emitting the debugger prologue
+ // if the "amdgpu-debugger-emit-prologue" attribute was specified.
+ if (ST.debuggerEmitPrologue())
+ createDebuggerPrologueStackObjects(MF);
SmallVector<ISD::InputArg, 16> Splits;
BitVector Skipped(Ins.size());
@@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments(
const ISD::InputArg &Arg = Ins[i];
// First check if it's a PS input addr
- if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+ if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
!Arg.Flags.isByVal() && PSInputNum <= 15) {
if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
@@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments(
++PSInputNum;
}
- // Second split vertices into their elements
- if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) {
- ISD::InputArg NewArg = Arg;
- NewArg.Flags.setSplit();
- NewArg.VT = Arg.VT.getVectorElementType();
-
- // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
- // three or five element vertex only needs three or five registers,
- // NOT four or eight.
- Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
- unsigned NumElements = ParamType->getVectorNumElements();
-
- for (unsigned j = 0; j != NumElements; ++j) {
- Splits.push_back(NewArg);
- NewArg.PartOffset += NewArg.VT.getStoreSize();
+ if (AMDGPU::isShader(CallConv)) {
+ // Second split vertices into their elements
+ if (Arg.VT.isVector()) {
+ ISD::InputArg NewArg = Arg;
+ NewArg.Flags.setSplit();
+ NewArg.VT = Arg.VT.getVectorElementType();
+
+ // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+ // three or five element vertex only needs three or five registers,
+ // NOT four or eight.
+ Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+ unsigned NumElements = ParamType->getVectorNumElements();
+
+ for (unsigned j = 0; j != NumElements; ++j) {
+ Splits.push_back(NewArg);
+ NewArg.PartOffset += NewArg.VT.getStoreSize();
+ }
+ } else {
+ Splits.push_back(Arg);
}
-
- } else if (Info->getShaderType() != ShaderType::COMPUTE) {
- Splits.push_back(Arg);
}
}
@@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments(
// - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
// - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
// enabled too.
- if (Info->getShaderType() == ShaderType::PIXEL &&
+ if (CallConv == CallingConv::AMDGPU_PS &&
((Info->getPSInputAddr() & 0x7F) == 0 ||
- ((Info->getPSInputAddr() & 0xF) == 0 &&
- Info->isPSInputAllocated(11)))) {
+ ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
CCInfo.AllocateReg(AMDGPU::VGPR0);
CCInfo.AllocateReg(AMDGPU::VGPR1);
Info->markPSInputAllocated(0);
Info->PSInputEna |= 1;
}
- if (Info->getShaderType() == ShaderType::COMPUTE) {
+ if (!AMDGPU::isShader(CallConv)) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
+
+ assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
+ } else {
+ assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+ !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+ !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
@@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(DispatchPtrReg);
}
+ if (Info->hasQueuePtr()) {
+ unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+ MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(QueuePtrReg);
+ }
+
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
+ if (Info->hasFlatScratchInit()) {
+ unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments(
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
@@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments(
NumElements = Arg.VT.getVectorNumElements() - NumElements;
Regs.append(NumElements, DAG.getUNDEF(VT));
- InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
+ InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
continue;
}
@@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = Info->addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("work group id x is always enabled");
+ }
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
@@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments(
if (Info->hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
- unsigned PrivateSegmentWaveByteOffsetReg
- = Info->addPrivateSegmentWaveByteOffset();
+ unsigned PrivateSegmentWaveByteOffsetReg;
+
+ if (AMDGPU::isShader(CallConv)) {
+ PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
+ Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+ } else
+ PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
@@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments(
// Now that we've figured out where the scratch register inputs are, see if
// should reserve the arguments and use them directly.
-
bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+ // Record that we know we have non-spill stack objects so we don't need to
+ // check all stack objects later.
+ if (HasStackObjects)
+ Info->setHasNonSpillStackObjects(true);
if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
@@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
- } else
- llvm_unreachable("workitem id x should always be enabled");
+ }
if (Info->hasWorkItemIDY()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
@@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
-SDValue SITargetLowering::LowerReturn(SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
+SDValue
+SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- if (Info->getShaderType() == ShaderType::COMPUTE)
+ if (!AMDGPU::isShader(CallConv))
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
@@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain,
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
-MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
- MachineInstr * MI, MachineBasicBlock * BB) const {
-
- switch (MI->getOpcode()) {
+unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("m0", AMDGPU::M0)
+ .Case("exec", AMDGPU::EXEC)
+ .Case("exec_lo", AMDGPU::EXEC_LO)
+ .Case("exec_hi", AMDGPU::EXEC_HI)
+ .Case("flat_scratch", AMDGPU::FLAT_SCR)
+ .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
+ .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
+ .Default(AMDGPU::NoRegister);
+
+ if (Reg == AMDGPU::NoRegister) {
+ report_fatal_error(Twine("invalid register name \""
+ + StringRef(RegName) + "\"."));
+ }
+
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+ Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+ report_fatal_error(Twine("invalid register \""
+ + StringRef(RegName) + "\" for subtarget."));
+ }
+
+ switch (Reg) {
+ case AMDGPU::M0:
+ case AMDGPU::EXEC_LO:
+ case AMDGPU::EXEC_HI:
+ case AMDGPU::FLAT_SCR_LO:
+ case AMDGPU::FLAT_SCR_HI:
+ if (VT.getSizeInBits() == 32)
+ return Reg;
+ break;
+ case AMDGPU::EXEC:
+ case AMDGPU::FLAT_SCR:
+ if (VT.getSizeInBits() == 64)
+ return Reg;
+ break;
default:
- return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+ llvm_unreachable("missing register type checking");
+ }
+
+ report_fatal_error(Twine("invalid type for register \""
+ + StringRef(RegName) + "\"."));
+}
+
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineBasicBlock::iterator SplitPoint(&MI);
+ ++SplitPoint;
+
+ if (SplitPoint == BB->end()) {
+ // Don't bother with a new block.
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return BB;
+ }
+
+ MachineFunction *MF = BB->getParent();
+ MachineBasicBlock *SplitBB
+ = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+ // Fix the block phi references to point to the new block for the defs in the
+ // second piece of the block.
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ for (MachineInstr &MI : *Succ) {
+ if (!MI.isPHI())
+ break;
+
+ for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) {
+ MachineOperand &FromBB = MI.getOperand(I);
+ if (BB == FromBB.getMBB()) {
+ FromBB.setMBB(SplitBB);
+ break;
+ }
+ }
+ }
+ }
+
+ MF->insert(++MachineFunction::iterator(BB), SplitBB);
+ SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
+ SplitBB->transferSuccessors(BB);
+ BB->addSuccessor(SplitBB);
+
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+ return SplitBB;
+}
+
+MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
+ MachineInstr &MI, MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_INIT_M0: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
+ TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addOperand(MI.getOperand(0));
+ MI.eraseFromParent();
+ break;
+ }
case AMDGPU::BRANCH:
return BB;
+ case AMDGPU::GET_GROUPSTATICSIZE: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+ MachineFunction *MF = BB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
+ .addOperand(MI.getOperand(0))
+ .addImm(MFI->LDSSize);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::SI_KILL:
+ return splitKillBlock(MI, BB);
+ default:
+ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
return BB;
}
@@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerTrig(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::GlobalAddress: {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalAddress(MFI, Op, DAG);
}
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
+ case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+ case ISD::TRAP: return lowerTRAP(Op, DAG);
}
return SDValue();
}
@@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();
- // A FrameIndex node represents a 32-bit offset into scratch memory. If
- // the high bit of a frame index offset were to be set, this would mean
- // that it represented an offset of ~2GB * 64 = ~128GB from the start of the
- // scratch buffer, with 64 being the number of threads per wave.
+ // A FrameIndex node represents a 32-bit offset into scratch memory. If the
+ // high bit of a frame index offset were to be set, this would mean that it
+ // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
+ // buffer, with 64 being the number of threads per wave.
//
- // If we know the machine uses less than 128GB of scratch, then we can
- // amrk the high bit of the FrameIndex node as known zero,
- // which is important, because it means in most situations we can
- // prove that values derived from FrameIndex nodes are non-negative.
- // This enables us to take advantage of more addressing modes when
- // accessing scratch buffers, since for scratch reads/writes, the register
- // offset must always be positive.
+ // The maximum private allocation for the entire GPU is 4G, and we are
+ // concerned with the largest the index could ever be for an individual
+ // workitem. This will occur with the minimum dispatch size. If a program
+ // requires more, the dispatch size will be reduced.
+ //
+ // With this limit, we can mark the high bit of the FrameIndex node as known
+ // zero, which is important, because it means in most situations we can prove
+ // that values derived from FrameIndex nodes are non-negative. This enables us
+ // to take advantage of more addressing modes when accessing scratch buffers,
+ // since for scratch reads/writes, the register offset must always be
+ // positive.
- SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
- if (Subtarget->enableHugeScratchBuffer())
- return TFI;
+ uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
+ // XXX - It is unclear if partial dispatch works. Assume it works at half wave
+ // granularity. It is probably a full wave.
+ uint64_t MinGranularity = 32;
+
+ unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
+
+ SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
- DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
+ DAG.getValueType(ExtVT));
+}
+
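A quick standalone check of the known-bits arithmetic, assuming the same constants as LowerFrameIndex (a sketch, not shared code):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; // 4 GiB total
  uint64_t MinGranularity = 32;                            // half a wave
  unsigned KnownBits = 0;                                  // Log2_64 by hand
  for (uint64_t V = MaxGPUAlloc / MinGranularity; V > 1; V >>= 1)
    ++KnownBits;
  // 4 GiB / 32 = 128 MiB = 2^27, so the top 5 bits of the 32-bit frame
  // index are known zero and derived values can be proven non-negative.
  assert(KnownBits == 27);
  return 0;
}
```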
+bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+ if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return false;
+
+ switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+ default: return false;
+ case AMDGPUIntrinsic::amdgcn_if:
+ case AMDGPUIntrinsic::amdgcn_else:
+ case AMDGPUIntrinsic::amdgcn_break:
+ case AMDGPUIntrinsic::amdgcn_if_break:
+ case AMDGPUIntrinsic::amdgcn_else_break:
+ case AMDGPUIntrinsic::amdgcn_loop:
+ case AMDGPUIntrinsic::amdgcn_end_cf:
+ return true;
+ }
+}
+
+void SITargetLowering::createDebuggerPrologueStackObjects(
+ MachineFunction &MF) const {
+ // Create stack objects that are used for emitting the debugger prologue.
+ //
+ // Debugger prologue writes work group IDs and work item IDs to scratch memory
+ // at a fixed location in the following format:
+ // offset 0: work group ID x
+ // offset 4: work group ID y
+ // offset 8: work group ID z
+ // offset 16: work item ID x
+ // offset 20: work item ID y
+ // offset 24: work item ID z
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ int ObjectIdx = 0;
+
+ // For each dimension:
+ for (unsigned i = 0; i < 3; ++i) {
+ // Create fixed stack object for work group ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true);
+ Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
+ // Create fixed stack object for work item ID.
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true);
+ Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
+ }
}
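A struct mirror of the fixed scratch layout listed in the comment; the type is an illustrative assumption, not something the patch defines:

```cpp
#include <cstddef>
#include <cstdint>

struct DebuggerPrologueArea {
  uint32_t WorkGroupID[3]; // offsets 0, 4, 8
  uint32_t Unused;         // offset 12
  uint32_t WorkItemID[3];  // offsets 16, 20, 24
};
static_assert(offsetof(DebuggerPrologueArea, WorkItemID) == 16,
              "work item IDs start at byte 16");
```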
/// This transforms the control flow intrinsics to get the branch destination as
@@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
SDNode *Intr = BRCOND.getOperand(1).getNode();
SDValue Target = BRCOND.getOperand(2);
SDNode *BR = nullptr;
+ SDNode *SetCC = nullptr;
if (Intr->getOpcode() == ISD::SETCC) {
// As long as we negate the condition everything is fine
- SDNode *SetCC = Intr;
- assert(SetCC->getConstantOperandVal(1) == 1);
- assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
- ISD::SETNE);
+ SetCC = Intr;
Intr = SetCC->getOperand(0).getNode();
} else {
@@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
Target = BR->getOperand(1);
}
- assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ if (!isCFIntrinsic(Intr)) {
+ // This is a uniform branch so we don't need to legalize.
+ return BRCOND;
+ }
+
+ assert(!SetCC ||
+ (SetCC->getConstantOperandVal(1) == 1 &&
+ cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
+ ISD::SETNE));
// Build the result and
ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
@@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
return Chain;
}
+SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+ SelectionDAG &DAG) const {
+ SDLoc SL;
+ MachineFunction &MF = DAG.getMachineFunction();
+ SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ assert(UserSGPR != AMDGPU::NoRegister);
+
+ SDValue QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ // Offset into amd_queue_t for group_segment_aperture_base_hi /
+ // private_segment_aperture_base_hi.
+ uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
+ DAG.getConstant(StructOffset, SL, MVT::i64));
+
+ // TODO: Use custom target PseudoSourceValue.
+ // TODO: We should use the value from the IR intrinsic call, but it might not
+ // be available, and it's unclear how we would get it here.
+ Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
+ AMDGPUAS::CONSTANT_ADDRESS));
+
+ MachinePointerInfo PtrInfo(V, StructOffset);
+ return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+ MinAlign(64, StructOffset),
+ MachineMemOperand::MOInvariant);
+}
+
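A sketch of the slice of amd_queue_t this load reads, assuming only the 0x40/0x44 offsets used above; the struct is illustrative, not the real HSA runtime definition:

```cpp
#include <cstddef>
#include <cstdint>

struct AmdQueueSlice {
  uint8_t  Leading[0x40];                // fields before the apertures
  uint32_t GroupSegmentApertureBaseHi;   // offset 0x40, LOCAL_ADDRESS
  uint32_t PrivateSegmentApertureBaseHi; // offset 0x44, PRIVATE_ADDRESS
};
static_assert(offsetof(AmdQueueSlice, PrivateSegmentApertureBaseHi) == 0x44,
              "matches the StructOffset computation");
```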
+SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
+
+ SDValue Src = ASC->getOperand(0);
+
+ // FIXME: Really support non-0 null pointers.
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
+ SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+
+ // flat -> local/private
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32,
+ NonNull, Ptr, SegmentNullPtr);
+ }
+ }
+
+ // local/private -> flat
+ if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue NonNull
+ = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+ SDValue CvtPtr
+ = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ FlatNullPtr);
+ }
+ }
+
+ // global <-> flat are no-ops and never emitted.
+
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+ *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
+ DAG.getContext()->diagnose(InvalidAddrSpaceCast);
+
+ return DAG.getUNDEF(ASC->getValueType(0));
+}
+
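A scalar sketch of the segment-to-flat direction above, assuming the aperture supplies the high 32 bits and the segment null pointer is -1; the helper is illustrative:

```cpp
#include <cstdint>

uint64_t segmentToFlat(uint32_t Ptr, uint32_t ApertureHi) {
  if (Ptr == UINT32_MAX)                      // segment null (SETNE guard)
    return 0;                                 // flat null pointer
  return ((uint64_t)ApertureHi << 32) | Ptr;  // BUILD_VECTOR + BITCAST
}
```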
+static bool shouldEmitGOTReloc(const GlobalValue *GV,
+ const TargetMachine &TM) {
+ return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
+}
+
+bool
+SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ // We can fold offsets for anything that doesn't require a GOT relocation.
+ return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+ !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine());
+}
+
+static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
+ SDLoc DL, unsigned Offset, EVT PtrVT,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) {
+ // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
+ // lowered to the following code sequence:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // What we want here is an offset from the value returned by s_getpc
+ // (which is the address of the s_add_u32 instruction) to the global
+ // variable, but since the encoding of $symbol starts 4 bytes after the start
+ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order to
+ // compute the correct address.
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
+ GAFlags);
+ return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA);
+}
+
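A worked example of the +4 adjustment, with illustrative addresses:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint64_t P = 0x1000;  // address of s_add_u32, i.e. the s_getpc_b64 result
  uint64_t S = 0x2000;  // address of the global variable
  uint64_t Offset = 0;  // offset into the global
  // The fixup resolves the literal to S + Addend - (P + 4), since $symbol
  // is encoded 4 bytes into s_add_u32. Addend = Offset + 4 cancels the -4:
  int64_t Literal = (int64_t)(S + (Offset + 4) - (P + 4));
  assert(P + Literal == S + Offset);
  return 0;
}
```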
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,
SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
- if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
+ GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
SDLoc DL(GSD);
const GlobalValue *GV = GSD->getGlobal();
- MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
+ EVT PtrVT = Op.getValueType();
+
+ if (!shouldEmitGOTReloc(GV, getTargetMachine()))
+ return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
+
+ SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
+ SIInstrInfo::MO_GOTPCREL);
+
+ Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
+ PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
+ MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
- SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
- return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ MachineMemOperand::MOInvariant);
}
-SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
- SDValue V) const {
+SDValue SITargetLowering::lowerTRAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
+ "trap handler not supported",
+ Op.getDebugLoc(),
+ DS_Warning);
+ DAG.getContext()->diagnose(NoTrap);
+
+ // Emit s_endpgm.
+
+ // FIXME: This should really be selected to s_trap, but that requires
+ // setting up the trap handler for it to do anything.
+ return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
+}
+
+SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
+ const SDLoc &DL, SDValue V) const {
+ // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
+ // the destination register.
+ //
// We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
// so we will end up with redundant moves to m0.
//
- // We can't use S_MOV_B32, because there is no way to specify m0 as the
- // destination register.
- //
- // We have to use them both. Machine cse will combine all the S_MOV_B32
- // instructions and the register coalescer eliminate the extra copies.
- SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V);
- return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32),
- SDValue(M0, 0), SDValue()); // Glue
- // A Null SDValue creates
- // a glue result.
+ // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
+
+ // A Null SDValue creates a glue result.
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
+ V, Chain);
+ return SDValue(M0, 0);
}
SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
@@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
DAG.getValueType(VT));
}
+static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "non-hsa intrinsic with hsa target",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
+static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, SDLoc DL, EVT VT) {
+ DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
+ "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
auto MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI =
- static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
EVT VT = Op.getValueType();
SDLoc DL(Op);
@@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
case Intrinsic::amdgcn_dispatch_ptr:
+ case Intrinsic::amdgcn_queue_ptr: {
if (!Subtarget->isAmdHsaOS()) {
- DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(),
- "hsa intrinsic without hsa target");
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
+ DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);
return DAG.getUNDEF(VT);
}
+ auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
- TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
-
+ TRI->getPreloadedValue(MF, Reg), VT);
+ }
+ case Intrinsic::amdgcn_implicitarg_ptr: {
+ unsigned Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
+ return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
+ }
+ case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ unsigned Reg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+ return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+ }
+ case Intrinsic::amdgcn_rcp:
+ return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq:
+ case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
+ return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ case Intrinsic::amdgcn_rsq_legacy: {
+ if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
+ return emitRemovedIntrinsicError(DAG, DL, VT);
+
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+ }
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+
+ Type *Type = VT.getTypeForEVT(*DAG.getContext());
+ APFloat Max = APFloat::getLargest(Type->getFltSemantics());
+ APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
+
+ SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
+ DAG.getConstantFP(Max, DL, VT));
+ return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
+ DAG.getConstantFP(Min, DL, VT));
+ }
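A scalar sketch of this clamp emulation, assuming f32; std::fmin/fmax follow the same IEEE minNum/maxNum rules as ISD::FMINNUM/FMAXNUM:

```cpp
#include <cfloat>
#include <cmath>

float rsqClampF32(float X) {
  float Rsq = 1.0f / std::sqrt(X);
  // fmin/fmax implement IEEE minNum/maxNum like ISD::FMINNUM/FMAXNUM, so
  // +/-infinity is clamped to the largest finite float of either sign.
  return std::fmax(std::fmin(Rsq, FLT_MAX), -FLT_MAX);
}
```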
case Intrinsic::r600_read_ngroups_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_X, false);
case Intrinsic::r600_read_ngroups_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Y, false);
case Intrinsic::r600_read_ngroups_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::NGROUPS_Z, false);
case Intrinsic::r600_read_global_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
case Intrinsic::r600_read_global_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
case Intrinsic::r600_read_global_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
case Intrinsic::r600_read_local_size_x:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_X);
case Intrinsic::r600_read_local_size_y:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Y);
case Intrinsic::r600_read_local_size_z:
+ if (Subtarget->isAmdHsaOS())
+ return emitNonHSAIntrinsicError(DAG, DL, VT);
+
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
- case Intrinsic::AMDGPU_read_workdim:
+ case Intrinsic::amdgcn_read_workdim:
+ case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
// Really only 2 bits.
return lowerImplicitZextParam(DAG, Op, MVT::i8,
getImplicitParameterOffset(MFI, GRID_DIM));
+ case Intrinsic::amdgcn_workgroup_id_x:
case Intrinsic::r600_read_tgid_x:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
+ case Intrinsic::amdgcn_workgroup_id_y:
case Intrinsic::r600_read_tgid_y:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
+ case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
+ case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
+ case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
+ case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
@@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
Op->getVTList(), Ops, VT, MMO);
}
- case AMDGPUIntrinsic::SI_sample:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
- case AMDGPUIntrinsic::SI_sampleb:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
- case AMDGPUIntrinsic::SI_sampled:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
- case AMDGPUIntrinsic::SI_samplel:
- return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
case AMDGPUIntrinsic::SI_vs_load_input:
return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
- case AMDGPUIntrinsic::AMDGPU_fract:
- case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1),
- DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1)));
case AMDGPUIntrinsic::SI_fs_constant: {
SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
SDValue Glue = M0.getValue(1);
@@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
Glue);
}
+ case Intrinsic::amdgcn_sin:
+ return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_cos:
+ return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_log_clamp: {
+ if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
+ return SDValue();
+
+ DiagnosticInfoUnsupported BadIntrin(
+ *MF.getFunction(), "intrinsic not supported on subtarget",
+ DL.getDebugLoc());
+ DAG.getContext()->diagnose(BadIntrin);
+ return DAG.getUNDEF(VT);
+ }
+ case Intrinsic::amdgcn_ldexp:
+ return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+
+ case Intrinsic::amdgcn_fract:
+ return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::amdgcn_class:
+ return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_fmas:
+ return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+ Op.getOperand(4));
+
+ case Intrinsic::amdgcn_div_fixup:
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+ case Intrinsic::amdgcn_trig_preop:
+ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::amdgcn_div_scale: {
+ // The 3rd parameter is required to be a constant.
+ const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ if (!Param)
+ return DAG.getUNDEF(VT);
+
+ // Translate to the operands expected by the machine instruction. The
+ // first parameter must be the same as the first instruction.
+ SDValue Numerator = Op.getOperand(1);
+ SDValue Denominator = Op.getOperand(2);
+
+ // Note this order is the opposite of the machine instruction's operand
+ // order, which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator.
+ // The intrinsic has the numerator as the first operand to match a normal
+ // division operation.
+
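+ // For example (hypothetical IR, not from this change):
+ //   %r = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %n, float %d, i1 true)
+ // passes true, selecting the numerator, so this builds DIV_SCALE %n, %d, %n.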
+ SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+
+ return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
+ Denominator, Numerator);
+ }
default:
return AMDGPUTargetLowering::LowerOperation(Op, DAG);
}
}

+SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+ M->getMemoryVT(), M->getMemOperand());
+ }
+ default:
+ return SDValue();
+ }
+}
+
SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
Op->getVTList(), Ops, VT, MMO);
}
+ case AMDGPUIntrinsic::AMDGPU_kill: {
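+ // A kill whose operand is a known non-negative constant can never fire,
+ // so it folds to just its chain.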
+ if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) {
+ if (!K->isNegative())
+ return Chain;
+ }
+
+ return Op;
+ }
default:
return SDValue();
}
@@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
+ ISD::LoadExtType ExtType = Load->getExtensionType();
+ EVT MemVT = Load->getMemoryVT();
- if (Op.getValueType().isVector()) {
- assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
- "Custom lowering for non-i32 vectors hasn't been implemented.");
- unsigned NumElements = Op.getValueType().getVectorNumElements();
- assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
+ if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
+ assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
+ // FIXME: Copied from PPC
+ // First, load into 32 bits, then truncate to 1 bit.
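+ // i.e. (i1 (load ptr)) becomes (i1 (trunc (i32 (extload ptr, i8)))),
+ // reusing the original chain and memory operand.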
- switch (Load->getAddressSpace()) {
- default: break;
- case AMDGPUAS::CONSTANT_ADDRESS:
- if (isMemOpUniform(Load))
- break;
- // Non-uniform loads will be selected to MUBUF instructions, so they
- // have the same legalization requires ments as global and private
- // loads.
- //
- // Fall-through
- case AMDGPUAS::GLOBAL_ADDRESS:
- case AMDGPUAS::PRIVATE_ADDRESS:
- if (NumElements >= 8)
- return SplitVectorLoad(Op, DAG);
-
- // v4 loads are supported for private and global memory.
- if (NumElements <= 4)
- break;
- // fall-through
- case AMDGPUAS::LOCAL_ADDRESS:
- // If properly aligned, if we split we might be able to use ds_read_b64.
+ SDValue Chain = Load->getChain();
+ SDValue BasePtr = Load->getBasePtr();
+ MachineMemOperand *MMO = Load->getMemOperand();
+
+ SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+ BasePtr, MVT::i8, MMO);
+
+ SDValue Ops[] = {
+ DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
+ NewLD.getValue(1)
+ };
+
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ if (!MemVT.isVector())
+ return SDValue();
+
+ assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
+ "Custom lowering for non-i32 vectors hasn't been implemented.");
+
+ unsigned AS = Load->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ AS, Load->getAlignment())) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ unsigned NumElements = MemVT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ if (isMemOpUniform(Load))
+ return SDValue();
+ // Non-uniform loads will be selected to MUBUF instructions, so they
+ // have the same legalization requirements as global and private
+ // loads.
+ //
+ // Fall-through
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorLoad(Op, DAG);
+ // v4 loads are supported for private and global memory.
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ // Depending on the setting of the private_element_size field in the
+ // resource descriptor, we can only make private accesses up to a certain
+ // size.
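+ // e.g. with a 4 byte private element size, a v4i32 load must be
+ // scalarized into four dword loads, while a 16 byte size keeps it whole.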
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorLoad(Load, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ case 16:
+ // Same as global/flat
+ if (NumElements > 4)
return SplitVectorLoad(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
}
}
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
+ return SplitVectorLoad(Op, DAG);
- return AMDGPUTargetLowering::LowerLOAD(Op, DAG);
-}
+ if (NumElements == 2)
+ return SDValue();
-SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
- const SDValue &Op,
- SelectionDAG &DAG) const {
- return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3),
- Op.getOperand(4));
+ // If properly aligned, splitting might let us use ds_read_b64.
+ return SplitVectorLoad(Op, DAG);
+ }
+ default:
+ return SDValue();
+ }
}

SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
- SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
+ SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
}
@@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}
}
- if (Unsafe) {
+ const SDNodeFlags *Flags = Op->getFlags();
+
+ if (Unsafe || Flags->hasAllowReciprocal()) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
SDNodeFlags Flags;
@@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
}

SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
- SDValue FastLowered = LowerFastFDIV(Op, DAG);
- if (FastLowered.getNode())
+ if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
return FastLowered;
- // This uses v_rcp_f32 which does not handle denormals. Let this hit a
- // selection error for now rather than do something incorrect.
- if (Subtarget->hasFP32Denormals())
- return SDValue();
-
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+ // Use the faster 2.5 ulp fdiv when the -amdgpu-fast-fdiv flag is set.
+ if (EnableAMDGPUFastFDIV) {
+ // This does not support denormals.
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+
+ const APFloat K0Val(BitsToFloat(0x6f800000));
+ const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+
+ const APFloat K1Val(BitsToFloat(0x2f800000));
+ const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
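+ // K0 is 2^96 and K1 is 2^-32: when |rhs| > 2^96, pre-scale the
+ // denominator by 2^-32 so its reciprocal is not flushed as a denormal,
+ // then multiply the result by the same factor to cancel the scale.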
+
+ const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
+
+ EVT SetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+
+ SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+
+ // TODO: Should this propagate fast-math-flags?
- const APFloat K0Val(BitsToFloat(0x6f800000));
- const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
- const APFloat K1Val(BitsToFloat(0x2f800000));
- const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
+ // rcp does not support denormals.
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ }
+
+ // Generates more precise fpdiv32.
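+ //
+ // A sketch of the refinement below (descriptive only, not new semantics):
+ //   e  = fma(-den, rcp, 1.0)          ; Fma0: reciprocal error term
+ //   r  = fma(e, rcp, rcp)             ; Fma1: one Newton-Raphson step
+ //   q  = num * r                      ; Mul:  initial quotient
+ //   q' = fma(fma(-den, q, num), r, q) ; Fma2/Fma3: refined quotient
+ // DIV_FMAS and DIV_FIXUP then apply the remaining scaling and edge-case
+ // handling.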
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+ SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
- SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
+ SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ RHS, RHS, LHS);
+ SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
+ LHS, RHS, LHS);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ // Denominator is scaled to not be denormal, so using rcp is ok.
+ SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
+ DenominatorScaled);
- // TODO: Should this propagate fast-math-flags?
+ SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled);
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0,
+ ApproxRcp, One);
+ SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
+ ApproxRcp);
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
+ NumeratorScaled);
+ SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul);
+ SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
+ NumeratorScaled);
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ SDValue Scale = NumeratorScaled.getValue(1);
+ SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1,
+ Fma3, Scale);
+
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
}

SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
@@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
SDValue Scale;
- if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+ if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
@@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
EVT VT = Store->getMemoryVT();
- // These stores are legal.
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- if (VT.isVector() && VT.getVectorNumElements() > 4)
- return ScalarizeVectorStore(Op, DAG);
- return SDValue();
+ if (VT == MVT::i1) {
+ return DAG.getTruncStore(Store->getChain(), DL,
+ DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
+ Store->getBasePtr(), MVT::i1, Store->getMemOperand());
}
- SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
- if (Ret.getNode())
- return Ret;
+ assert(VT.isVector() &&
+ Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (VT.isVector() && VT.getVectorNumElements() >= 8)
+ unsigned AS = Store->getAddressSpace();
+ if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ AS, Store->getAlignment())) {
+ return expandUnalignedStore(Store, DAG);
+ }
+
+ unsigned NumElements = VT.getVectorNumElements();
+ switch (AS) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::FLAT_ADDRESS:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case AMDGPUAS::PRIVATE_ADDRESS: {
+ switch (Subtarget->getMaxPrivateElementSize()) {
+ case 4:
+ return scalarizeVectorStore(Store, DAG);
+ case 8:
+ if (NumElements > 2)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ case 16:
+ if (NumElements > 4)
+ return SplitVectorStore(Op, DAG);
+ return SDValue();
+ default:
+ llvm_unreachable("unsupported private_element_size");
+ }
+ }
+ case AMDGPUAS::LOCAL_ADDRESS: {
+ if (NumElements > 2)
return SplitVectorStore(Op, DAG);
- if (VT == MVT::i1)
- return DAG.getTruncStore(Store->getChain(), DL,
- DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
- Store->getBasePtr(), MVT::i1, Store->getMemOperand());
+ if (NumElements == 2)
+ return Op;
- return SDValue();
+ // If properly aligned, splitting might let us use ds_write_b64.
+ return SplitVectorStore(Op, DAG);
+ }
+ default:
+ llvm_unreachable("unhandled address space");
+ }
}

SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
@@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
}
}

+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+ assert(AtomicNode->isCompareAndSwap());
+ unsigned AS = AtomicNode->getAddressSpace();
+
+ // No custom lowering required for local address space
+ if (!isFlatGlobalAddrSpace(AS))
+ return Op;
+
+ // Non-local address spaces need custom lowering for atomic compare and
+ // swap: the compare and new values are packed into a v2i32 (v2i64 for
+ // the _X2 variants).
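+ // e.g. an i32 cmpxchg to global memory packs {new, old} into a v2i32
+ // data operand for the selected cmpswap machine atomic.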
+ SDLoc DL(Op);
+ SDValue ChainIn = Op.getOperand(0);
+ SDValue Addr = Op.getOperand(1);
+ SDValue Old = Op.getOperand(2);
+ SDValue New = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ MVT SimpleVT = VT.getSimpleVT();
+ MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
+ SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
+ SDValue Ops[] = { ChainIn, Addr, NewOld };
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
+ Ops, VT, AtomicNode->getMemOperand());
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG optimizations
//===----------------------------------------------------------------------===//
@@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
}
}
- // We are primarily trying to catch operations on illegal vector types
- // before they are expanded.
- // For scalars, we can use the more flexible method of checking masked bits
- // after legalization.
- if (!DCI.isBeforeLegalize() ||
- !SrcVT.isVector() ||
- SrcVT.getVectorElementType() != MVT::i8) {
- return SDValue();
- }
-
- assert(DCI.isBeforeLegalize() && "Unexpected legal type");
-
- // Weird sized vectors are a pain to handle, but we know 3 is really the same
- // size as 4.
- unsigned NElts = SrcVT.getVectorNumElements();
- if (!SrcVT.isSimple() && NElts != 3)
- return SDValue();
-
- // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
- // prevent a mess from expanding to v4i32 and repacking.
- if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
- EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
- EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
- LoadSDNode *Load = cast<LoadSDNode>(Src);
-
- unsigned AS = Load->getAddressSpace();
- unsigned Align = Load->getAlignment();
- Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
-
- // Don't try to replace the load if we have to expand it due to alignment
- // problems. Otherwise we will end up scalarizing the load, and trying to
- // repack into the vector for no real reason.
- if (Align < ABIAlignment &&
- !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
- return SDValue();
- }
-
- SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
- Load->getChain(),
- Load->getBasePtr(),
- LoadVT,
- Load->getMemOperand());
-
- // Make sure successors of the original load stay after it by updating
- // them to use the new Chain.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
-
- SmallVector<SDValue, 4> Elts;
- if (RegVT.isVector())
- DAG.ExtractVectorElements(NewLoad, Elts);
- else
- Elts.push_back(NewLoad);
-
- SmallVector<SDValue, 4> Ops;
-
- unsigned EltIdx = 0;
- for (SDValue Elt : Elts) {
- unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
- for (unsigned I = 0; I < ComponentsInElt; ++I) {
- unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
- SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
- DCI.AddToWorklist(Cvt.getNode());
- Ops.push_back(Cvt);
- }
-
- ++EltIdx;
- }
-
- assert(Ops.size() == NElts);
-
- return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
- }
-
return SDValue();
}

/// \brief Return true if the given offset Size in bytes can be folded into
/// the immediate offsets of a memory instruction for the given address space.
static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
- const AMDGPUSubtarget &STI) {
+ const SISubtarget &STI) {
switch (AS) {
case AMDGPUAS::GLOBAL_ADDRESS: {
// MUBUF instructions have a 12-bit offset in bytes.
@@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
case AMDGPUAS::CONSTANT_ADDRESS: {
// SMRD instructions have an 8-bit offset in dwords on SI and
// a 20-bit offset in bytes on VI.
- if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
return isUInt<20>(OffsetSize);
else
return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
@@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
// If the resulting offset is too large, we can't fold it into the addressing
// mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
- if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
+ if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
if (DCI.isBeforeLegalize())
return SDValue();
+ if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
+ return Base;
+
SelectionDAG &DAG = DCI.DAG;
// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
@@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::i64) {
+ // TODO: This could be a generic combine with a predicate for extracting the
+ // high half of an integer being free.
+
+ // (or i64:x, (zero_extend i32:y)) ->
+ // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
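+ // Canonicalize the zero_extend, if any, onto the RHS so only one pattern
+ // form needs checking below.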
+ if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
+ RHS.getOpcode() != ISD::ZERO_EXTEND)
+ std::swap(LHS, RHS);
+
+ if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue ExtSrc = RHS.getOperand(0);
+ EVT SrcVT = ExtSrc.getValueType();
+ if (SrcVT == MVT::i32) {
+ SDLoc SL(N);
+ SDValue LowLHS, HiBits;
+ std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
+ SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
+
+ DCI.AddToWorklist(LowOr.getNode());
+ DCI.AddToWorklist(HiBits.getNode());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+ LowOr, HiBits);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+ }
+ }
+
// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
@@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
return DAG.getConstant(0, SDLoc(N), MVT::i1);
}

+ if (N->getOperand(0).isUndef())
+ return DAG.getUNDEF(MVT::i1);
+
return SDValue();
}

+// Constant fold canonicalize.
+SDValue SITargetLowering::performFCanonicalizeCombine(
+ SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+ if (!CFP)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ const APFloat &C = CFP->getValueAPF();
+
+ // Flush denormals to 0 if not enabled.
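+ // (e.g. the smallest f32 denormal, bit pattern 0x00000001, folds to +0.0
+ // when f32 denormals are disabled).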
+ if (C.isDenormal()) {
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+
+ if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+ return DAG.getConstantFP(0.0, SDLoc(N), VT);
+ }
+
+ if (C.isNaN()) {
+ EVT VT = N->getValueType(0);
+ APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
+ if (C.isSignaling()) {
+ // Quiet a signaling NaN.
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ // Make sure it is the canonical NaN bitpattern.
+ //
+ // TODO: Can we use -1 as the canonical NaN value since it's an inline
+ // immediate?
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
+ return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
+ }
+
+ return SDValue(CFP, 0);
+}
+
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
switch (Opc) {
case ISD::FMAXNUM:
@@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
- DAGCombinerInfo &DCI) const {
+static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1, bool Signed) {
+ ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ if (Signed) {
+ if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ return SDValue();
+ } else {
+ if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ return SDValue();
+ }
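+
+ // e.g. umin(umax(x, 4), 8) becomes umed3(x, 4, 8), clamping x to [4, 8]
+ // in a single instruction.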
+
+ EVT VT = K0->getValueType(0);
+ return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+ Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+}
+
+static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
+ if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
+ return true;
+
+ return DAG.isKnownNeverNaN(Op);
+}
+
+static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+ SDValue Op0, SDValue Op1) {
+ ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
+ if (!K1)
+ return SDValue();
+
+ ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
+ if (!K0)
+ return SDValue();
+
+ // Ordered >= (although NaN inputs should have folded away by now).
+ APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
+ if (Cmp == APFloat::cmpGreaterThan)
+ return SDValue();
+
+ // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
+ // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
+ // give the other result, which is different from med3 with a NaN input.
+ SDValue Var = Op0.getOperand(0);
+ if (!isKnownNeverSNan(DAG, Var))
+ return SDValue();
+
+ return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
+ Var, SDValue(K0, 0), SDValue(K1, 0));
+}
+
+SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
unsigned Opc = N->getOpcode();
@@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N,
// Only do this if the inner op has one use, since it would otherwise just
// increase register pressure for no benefit.
- // max(max(a, b), c)
- if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0.getOperand(0),
- Op0.getOperand(1),
- Op1);
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+ // max(max(a, b), c) -> max3(a, b, c)
+ // min(min(a, b), c) -> min3(a, b, c)
+ if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0.getOperand(0),
+ Op0.getOperand(1),
+ Op1);
+ }
+
+ // Try commuted.
+ // max(a, max(b, c)) -> max3(a, b, c)
+ // min(a, min(b, c)) -> min3(a, b, c)
+ if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
+ SDLoc DL(N);
+ return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
+ DL,
+ N->getValueType(0),
+ Op0,
+ Op1.getOperand(0),
+ Op1.getOperand(1));
+ }
}
- // max(a, max(b, c))
- if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
- SDLoc DL(N);
- return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
- DL,
- N->getValueType(0),
- Op0,
- Op1.getOperand(0),
- Op1.getOperand(1));
+ // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+ if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+ return Med3;
+ }
+
+ if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+ return Med3;
+ }
+
+ // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
+ if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
+ (Opc == AMDGPUISD::FMIN_LEGACY &&
+ Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
+ N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+ if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
+ return Res;
}
return SDValue();
@@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
- case ISD::FMAXNUM: // TODO: What about fmax_legacy?
+ case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
- case ISD::UMIN: {
+ case ISD::UMIN:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
- return performMin3Max3Combine(N, DCI);
+ return performMinMaxCombine(N, DCI);
break;
}
@@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::CVT_F32_UBYTE2:
case AMDGPUISD::CVT_F32_UBYTE3: {
unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
-
SDValue Src = N->getOperand(0);
+
+ // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (Src.getOpcode() == ISD::SRL) {
+ // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
+ // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) {
+ unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
+ if (SrcOffset < 32 && SrcOffset % 8 == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL,
+ MVT::f32, Src.getOperand(0));
+ }
+ }
+ }
+
APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
APInt KnownZero, KnownOne;
@@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+ case ISD::ATOMIC_LOAD_UMAX:
+ case AMDGPUISD::ATOMIC_INC:
+ case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
if (DCI.isBeforeLegalize())
break;
@@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performOrCombine(N, DCI);
case AMDGPUISD::FP_CLASS:
return performClassCombine(N, DCI);
+ case ISD::FCANONICALIZE:
+ return performFCanonicalizeCombine(N, DCI);
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RSQ_LEGACY:
+ case AMDGPUISD::RSQ_CLAMP:
+ case AMDGPUISD::LDEXP: {
+ SDValue Src = N->getOperand(0);
+ if (Src.isUndef())
+ return Src;
+ break;
+ }
}
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
@@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
-
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
if (TII->isInlineConstant(Node->getAPIntValue()))
@@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
SDNode *Users[4] = { };
unsigned Lane = 0;
- unsigned OldDmask = Node->getConstantOperandVal(0);
+ unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
+ unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
unsigned NewDmask = 0;
// Try to figure out the used register components
@@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Adjust the writemask in the node
std::vector<SDValue> Ops;
+ Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
- Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
+ Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
// If we only got one lane, replace it with a copy
@@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
SelectionDAG &DAG) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ unsigned Opcode = Node->getMachineOpcode();
- if (TII->isMIMG(Node->getMachineOpcode()))
+ if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+ !TII->isGather4(Opcode))
adjustWritemask(Node, DAG);
- if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
- Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+ if (Opcode == AMDGPU::INSERT_SUBREG ||
+ Opcode == AMDGPU::REG_SEQUENCE) {
legalizeTargetIndependentNode(Node, DAG);
return Node;
}
@@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
-void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- if (TII->isVOP3(MI->getOpcode())) {
+ if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
TII->legalizeOperandsVOP3(MRI, MI);
return;
}
- if (TII->isMIMG(*MI)) {
- unsigned VReg = MI->getOperand(0).getReg();
- unsigned Writemask = MI->getOperand(1).getImm();
+ if (TII->isMIMG(MI)) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
+ unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
unsigned BitsSet = 0;
for (unsigned i = 0; i < 4; ++i)
BitsSet += Writemask & (1 << i) ? 1 : 0;
@@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
case 3: RC = &AMDGPU::VReg_96RegClass; break;
}
- unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
- MI->setDesc(TII->get(NewOpcode));
+ unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
+ MI.setDesc(TII->get(NewOpcode));
MRI.setRegClass(VReg, RC);
return;
}
// Replace unused atomics with the no return version.
- int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
+ int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
if (NoRetAtomicOp != -1) {
if (!Node->hasAnyUseOfValue(0)) {
- MI->setDesc(TII->get(NoRetAtomicOp));
- MI->RemoveOperand(0);
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+ return;
}
+ // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
+ // instruction, because the return type of these instructions is a vec2 of
+ // the memory type, so it can be tied to the input operand.
+ // This means these instructions always have a use, so we need to add a
+ // special case to check if the atomic has only one extract_subreg use,
+ // which itself has no uses.
+ if ((Node->hasNUsesOfValue(1, 0) &&
+ Node->use_begin()->isMachineOpcode() &&
+ Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+ !Node->use_begin()->hasAnyUseOfValue(0))) {
+ unsigned Def = MI.getOperand(0).getReg();
+
+ // Change this into a noret atomic.
+ MI.setDesc(TII->get(NoRetAtomicOp));
+ MI.RemoveOperand(0);
+
+ // If we only remove the def operand from the atomic instruction, the
+ // extract_subreg will be left with a use of a vreg without a def.
+ // So we need to insert an implicit_def to avoid machine verifier
+ // errors.
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Def);
+ }
return;
}
}

-static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
+static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
+ uint64_t Val) {
SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}

MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
- SDLoc DL,
+ const SDLoc &DL,
SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
// Build the half of the subregister with the constants before building the
// full 128-bit register. If we are building multiple resource descriptors,
@@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
-MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr,
- uint32_t RsrcDword1,
+MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Ptr, uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const {
SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);