Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 1725
1 files changed, 1263 insertions, 462 deletions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 544867513d9c..51241cf0a432 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18,33 +18,46 @@
 #include <cmath>
 #endif
 
-#include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
-#include "llvm/ADT/SmallString.h"
 
 using namespace llvm;
 
-SITargetLowering::SITargetLowering(TargetMachine &TM,
-                                   const AMDGPUSubtarget &STI)
+// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv.
+static cl::opt<bool> EnableAMDGPUFastFDIV(
+  "amdgpu-fast-fdiv",
+  cl::desc("Enable faster 2.5 ulp fdiv"),
+  cl::init(false));
+
+static unsigned findFirstFreeSGPR(CCState &CCInfo) {
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
+    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
+      return AMDGPU::SGPR0 + Reg;
+    }
+  }
+  llvm_unreachable("Cannot allocate sgpr");
+}
+
+SITargetLowering::SITargetLowering(const TargetMachine &TM,
+                                   const SISubtarget &STI)
     : AMDGPUTargetLowering(TM, STI) {
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
-  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
-  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
-
   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
 
@@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM,
 
   computeRegisterProperties(STI.getRegisterInfo());
 
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
-  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
-
-  setOperationAction(ISD::ADD, MVT::i32, Legal);
-  setOperationAction(ISD::ADDC, MVT::i32, Legal);
-  setOperationAction(ISD::ADDE, MVT::i32, Legal);
-  setOperationAction(ISD::SUBC, MVT::i32, Legal);
-  setOperationAction(ISD::SUBE, MVT::i32, Legal);
-
-  setOperationAction(ISD::FSIN, MVT::f32, Custom);
-  setOperationAction(ISD::FCOS, MVT::f32, Custom);
-
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
-
   // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
 
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i1, Custom);
-  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+
setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + + setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Promote); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - - 
setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); - - setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); - setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::LOAD, MVT::v2i64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v2i64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - - - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { + switch (Op) { case ISD::LOAD: case ISD::STORE: case ISD::BUILD_VECTOR: @@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, + // and output demarshalling + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + + // We can't return success/failure, only the old value, + // let LLVM add the comparison + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + + if (getSubtarget()->hasFlatAddressSpace()) { + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + } + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + // On SI this is s_memtime and s_memrealtime on VI. 
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FCANONICALIZE); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); } +const SISubtarget *SITargetLowering::getSubtarget() const { + return static_cast<const SISubtarget *>(Subtarget); +} + //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &CI, + unsigned IntrID) const { + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = true; + return true; + default: + return false; + } +} + bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no @@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. // FIXME: This assumption is currently wrong. On VI we still use @@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (DL.getTypeStoreSize(Ty) < 4) return isLegalMUBUFAddressingMode(AM); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. 
if (!isUInt<20>(AM.BaseOffs)) return false; @@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: return isLegalMUBUFAddressingMode(AM); case AMDGPUAS::LOCAL_ADDRESS: @@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + // For an unknown address space, this usually means that this is for some + // reason being used for pure arithmetic, and not based on some addressing + // computation. We don't have instructions that compute pointers with any + // addressing modes, so treat them as having no offset like flat + // instructions. return isLegalFlatAddressingMode(AM); default: @@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. - - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. bool AlignedBy4 = (Align % 4 == 0); if (IsFast) *IsFast = AlignedBy4; + return AlignedBy4; } + if (Subtarget->hasUnalignedBufferAccess()) { + // If we have an uniform constant load, it still requires using a slow + // buffer instruction if unaligned. + if (IsFast) { + *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + (Align % 4 == 0) : true; + } + + return true; + } + // Smaller than dword value must be aligned. - // FIXME: This should be allowed on CI+ if (VT.bitsLT(MVT::i32)) return false; @@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) { bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } - bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers - if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || - isa<GlobalValue>(Ptr)) + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. 
+ if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; - const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); return TII->isInlineConstant(Imm); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { +bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { + + // SimplifySetCC uses this function to determine whether or not it should + // create setcc with i1 operands. We don't have instructions for i1 setcc. + if (VT == MVT::i1 && Op == ISD::SETCC) + return false; + + return TargetLowering::isTypeDesirableForOp(Op, VT); +} + +SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, SDValue Chain, + unsigned Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); +} +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue PtrOffset = DAG.getUNDEF(PtrVT); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, if (MemVT.isFloatingPoint()) ExtTy = ISD::EXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment + SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, + PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SmallVectorImpl<ISD::InputArg> &Ins, const 
SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); - return SDValue(); + return DAG.getEntryNode(); } - // FIXME: We currently assume all calling conventions are kernels. + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { @@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments( ++PSInputNum; } - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); + if (AMDGPU::isShader(CallConv)) { + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + } else { + Splits.push_back(Arg); } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); } } @@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. 
- if (Info->getShaderType() == ShaderType::PIXEL && + if (CallConv == CallingConv::AMDGPU_PS && ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11)))) { + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); Info->markPSInputAllocated(0); Info->PSInputEna |= 1; } - if (Info->getShaderType() == ShaderType::COMPUTE) { + if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); + + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? @@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(DispatchPtrReg); } + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(QueuePtrReg); + } + if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments( NumElements = Arg.VT.getVectorNumElements() - NumElements; Regs.append(NumElements, DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); continue; } @@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = Info->addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("work group id x is always enabled"); + } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); @@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. 
- unsigned PrivateSegmentWaveByteOffsetReg - = Info->addPrivateSegmentWaveByteOffset(); + unsigned PrivateSegmentWaveByteOffsetReg; + + if (AMDGPU::isShader(CallConv)) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } else + PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); @@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to + // check all stack objects later. + if (HasStackObjects) + Info->setHasNonSpillStackObjects(true); if (ST.isAmdHsaOS()) { // TODO: Assume we will spill without optimizations. @@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("workitem id x should always be enabled"); + } if (Info->hasWorkItemIDY()) { unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); @@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } -SDValue SITargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { +SDValue +SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (Info->getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(CallConv)) return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); @@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); + unsigned Opc = Info->returnsVoid() ? 
AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; + return DAG.getNode(Opc, DL, MVT::Other, RetOps); } -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - switch (MI->getOpcode()) { +unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("m0", AMDGPU::M0) + .Case("exec", AMDGPU::EXEC) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Default(AMDGPU::NoRegister); + + if (Reg == AMDGPU::NoRegister) { + report_fatal_error(Twine("invalid register name \"" + + StringRef(RegName) + "\".")); + + } + + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { + report_fatal_error(Twine("invalid register \"" + + StringRef(RegName) + "\" for subtarget.")); + } + + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + if (VT.getSizeInBits() == 32) + return Reg; + break; + case AMDGPU::EXEC: + case AMDGPU::FLAT_SCR: + if (VT.getSizeInBits() == 64) + return Reg; + break; default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + llvm_unreachable("missing register type checking"); + } + + report_fatal_error(Twine("invalid type for register \"" + + StringRef(RegName) + "\".")); +} + +// If kill is not the last instruction, split the block so kill is always a +// proper terminator. +MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineBasicBlock::iterator SplitPoint(&MI); + ++SplitPoint; + + if (SplitPoint == BB->end()) { + // Don't bother with a new block. + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineBasicBlock *SplitBB + = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + // Fix the block phi references to point to the new block for the defs in the + // second piece of the block. 
+ for (MachineBasicBlock *Succ : BB->successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &FromBB = MI.getOperand(I); + if (BB == FromBB.getMBB()) { + FromBB.setMBB(SplitBB); + break; + } + } + } + } + + MF->insert(++MachineFunction::iterator(BB), SplitBB); + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + + SplitBB->transferSuccessors(BB); + BB->addSuccessor(SplitBB); + + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return SplitBB; +} + +MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + case AMDGPU::SI_INIT_M0: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(MI.getOperand(0)); + MI.eraseFromParent(); + break; + } case AMDGPU::BRANCH: return BB; + case AMDGPU::GET_GROUPSTATICSIZE: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_KILL: + return splitKillBlock(MI, BB); + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } return BB; } @@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { MachineFunction &MF = DAG.getMachineFunction(); @@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalAddress(MFI, Op, DAG); } case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::TRAP: return lowerTRAP(Op, DAG); } return SDValue(); } @@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. // - // If we know the machine uses less than 128GB of scratch, then we can - // amrk the high bit of the FrameIndex node as known zero, - // which is important, because it means in most situations we can - // prove that values derived from FrameIndex nodes are non-negative. 
- // This enables us to take advantage of more addressing modes when - // accessing scratch buffers, since for scratch reads/writes, the register - // offset must always be positive. + // The maximum private allocation for the entire GPU is 4G, and we are + // concerned with the largest the index could ever be for an individual + // workitem. This will occur with the minmum dispatch size. If a program + // requires more, the dispatch size will be reduced. + // + // With this limit, we can mark the high bit of the FrameIndex node as known + // zero, which is important, because it means in most situations we can prove + // that values derived from FrameIndex nodes are non-negative. This enables us + // to take advantage of more addressing modes when accessing scratch buffers, + // since for scratch reads/writes, the register offset must always be + // positive. - SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - if (Subtarget->enableHugeScratchBuffer()) - return TFI; + uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; + // XXX - It is unclear if partial dispatch works. Assume it works at half wave + // granularity. It is probably a full wave. + uint64_t MinGranularity = 32; + + unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); + DAG.getValueType(ExtVT)); +} + +bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { + if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + + switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { + default: return false; + case AMDGPUIntrinsic::amdgcn_if: + case AMDGPUIntrinsic::amdgcn_else: + case AMDGPUIntrinsic::amdgcn_break: + case AMDGPUIntrinsic::amdgcn_if_break: + case AMDGPUIntrinsic::amdgcn_else_break: + case AMDGPUIntrinsic::amdgcn_loop: + case AMDGPUIntrinsic::amdgcn_end_cf: + return true; + } +} + +void SITargetLowering::createDebuggerPrologueStackObjects( + MachineFunction &MF) const { + // Create stack objects that are used for emitting debugger prologue. + // + // Debugger prologue writes work group IDs and work item IDs to scratch memory + // at fixed location in the following format: + // offset 0: work group ID x + // offset 4: work group ID y + // offset 8: work group ID z + // offset 16: work item ID x + // offset 20: work item ID y + // offset 24: work item ID z + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + int ObjectIdx = 0; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Create fixed stack object for work group ID. + ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true); + Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); + // Create fixed stack object for work item ID. 
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true); + Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); + } } /// This transforms the control flow intrinsics to get the branch destination as @@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); SDNode *BR = nullptr; + SDNode *SetCC = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); + SetCC = Intr; Intr = SetCC->getOperand(0).getNode(); } else { @@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Target = BR->getOperand(1); } - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + if (!isCFIntrinsic(Intr)) { + // This is a uniform branch so we don't need to legalize. + return BRCOND; + } + + assert(!SetCC || + (SetCC->getConstantOperandVal(1) == 1 && + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE)); // Build the result and ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); @@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::getSegmentAperture(unsigned AS, + SelectionDAG &DAG) const { + SDLoc SL; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + // Offset into amd_queue_t for group_segment_aperture_base_hi / + // private_segment_aperture_base_hi. + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + + SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, SL, MVT::i64)); + + // TODO: Use custom target PseudoSourceValue. + // TODO: We should use the value from the IR intrinsic call, but it might not + // be available and how do we get it? + Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), + AMDGPUAS::CONSTANT_ADDRESS)); + + MachinePointerInfo PtrInfo(V, StructOffset); + return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + MinAlign(64, StructOffset), + MachineMemOperand::MOInvariant); +} + +SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); + + SDValue Src = ASC->getOperand(0); + + // FIXME: Really support non-0 null pointers. 
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); + SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + + // flat -> local/private + if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + return DAG.getNode(ISD::SELECT, SL, MVT::i32, + NonNull, Ptr, SegmentNullPtr); + } + } + + // local/private -> flat + if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull + = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue CvtPtr + = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, + DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + FlatNullPtr); + } + } + + // global <-> flat are no-ops and never emitted. + + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + DAG.getContext()->diagnose(InvalidAddrSpaceCast); + + return DAG.getUNDEF(ASC->getValueType(0)); +} + +static bool shouldEmitGOTReloc(const GlobalValue *GV, + const TargetMachine &TM) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); +} + +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // We can fold offsets for anything that doesn't require a GOT relocation. + return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); +} + +static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + SDLoc DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { + // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is + // lowered to the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. 
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA); +} + SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); + EVT PtrVT = Op.getValueType(); + + if (!shouldEmitGOTReloc(GV, getTargetMachine())) + return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); + + SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, + SIInstrInfo::MO_GOTPCREL); + + Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + const DataLayout &DataLayout = DAG.getDataLayout(); + unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + // FIXME: Use a PseudoSourceValue once those can be assigned an address space. + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, + MachineMemOperand::MOInvariant); } -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { +SDValue SITargetLowering::lowerTRAP(SDValue Op, + SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "trap handler not supported", + Op.getDebugLoc(), + DS_Warning); + DAG.getContext()->diagnose(NoTrap); + + // Emit s_endpgm. + + // FIXME: This should really be selected to s_trap, but that requires + // setting up the trap handler for it o do anything. + return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, + Op.getOperand(0)); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, + const SDLoc &DL, SDValue V) const { + // We can't use S_MOV_B32 directly, because there is no way to specify m0 as + // the destination register. + // // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, // so we will end up with redundant moves to m0. // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. - SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. + // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. + + // A Null SDValue creates a glue result. 
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, + V, Chain); + return SDValue(M0, 0); } SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, @@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, DAG.getValueType(VT)); } +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + case Intrinsic::amdgcn_queue_ptr: { if (!Subtarget->isAmdHsaOS()) { - DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), - "hsa intrinsic without hsa target"); + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "unsupported hsa intrinsic without hsa target", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } + auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); - + TRI->getPreloadedValue(MF, Reg), VT); + } + case Intrinsic::amdgcn_implicitarg_ptr: { + unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + } + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Reg + = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } + case Intrinsic::amdgcn_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq: + case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } + case Intrinsic::amdgcn_rsq_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } case Intrinsic::r600_read_ngroups_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: + if (Subtarget->isAmdHsaOS()) + return 
emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::AMDGPU_read_workdim: + case Intrinsic::amdgcn_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. // Really only 2 bits. return lowerImplicitZextParam(DAG, Op, MVT::i8, getImplicitParameterOffset(MFI, GRID_DIM)); + case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); + case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); @@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); case AMDGPUIntrinsic::SI_fs_constant: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); SDValue Glue = M0.getValue(1); @@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_sin: + return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_cos: + return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_log_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + case Intrinsic::amdgcn_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::amdgcn_fract: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::amdgcn_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } } +SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + MemSDNode *M = cast<MemSDNode>(Op); + unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: + return SDValue(); + } +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::AMDGPU_kill: { + if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) { + if (!K->isNegative()) + return Chain; + } + + return Op; + } default: return SDValue(); } @@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { + assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::CONSTANT_ADDRESS: - if (isMemOpUniform(Load)) - break; - // Non-uniform loads will be selected to MUBUF instructions, so they - // have the same legalization requires ments as global and private - // loads. - // - // Fall-through - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - if (NumElements >= 8) - return SplitVectorLoad(Op, DAG); - - // v4 loads are supported for private and global memory. - if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - // If properly aligned, if we split we might be able to use ds_read_b64. + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (!MemVT.isVector()) + return SDValue(); + + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + + unsigned AS = Load->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + AS, Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + + unsigned NumElements = MemVT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + return SDValue(); + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. 
+ // + // Fall-through + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + // Depending on the setting of the private_element_size field in the + // resource descriptor, we can only make private accesses up to a certain + // size. + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorLoad(Load, DAG); + case 8: + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + return SDValue(); + case 16: + // Same as global/flat + if (NumElements > 4) return SplitVectorLoad(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); } } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} + if (NumElements == 2) + return SDValue(); -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); + } + default: + return SDValue(); + } } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } @@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } } - if (Unsafe) { + const SDNodeFlags *Flags = Op->getFlags(); + + if (Unsafe || Flags->hasAllowReciprocal()) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDNodeFlags Flags; @@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) + if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) return FastLowered; - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. - if (Subtarget->hasFP32Denormals()) - return SDValue(); - SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag + if (EnableAMDGPUFastFDIV) { + // This does not support denormals. + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? 
- const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + } + + // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // Denominator is scaled to not be denormal, so using rcp is ok. + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); - // TODO: Should this propagate fast-math-flags? + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); EVT VT = Store->getMemoryVT(); - // These stores are legal. 
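Aside on the -amdgpu-fast-fdiv path above: the fabs / setcc against 0x6f800000 / select of 0x2f800000 or 1.0 / rcp sequence is a guarded rescaling of the denominator so that the approximate hardware reciprocal never sees an operand so large that its reciprocal flushes to zero. A minimal scalar sketch of the same arithmetic (bitsToFloat and fastFDiv32 are illustrative names, and 1.0f / x stands in for v_rcp_f32, which is approximate and ignores denormals):

#include <cmath>
#include <cstdint>
#include <cstring>

static float bitsToFloat(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

float fastFDiv32(float Num, float Den) {
  const float K0 = bitsToFloat(0x6f800000u); // 2^96
  const float K1 = bitsToFloat(0x2f800000u); // 2^-32

  // If |Den| is very large, pre-scale it by 2^-32 so the reciprocal does not
  // underflow, then scale the quotient by 2^-32 again to compensate:
  //   Num * rcp(Den * 2^-32) * 2^-32 == Num / Den
  float Scale = (std::fabs(Den) > K0) ? K1 : 1.0f;
  float Rcp = 1.0f / (Den * Scale); // stands in for v_rcp_f32
  return Scale * (Num * Rcp);
}

The default expansion (div_scale, fma refinement, div_fmas, div_fixup) avoids the need for this trick: per its own comments, div_scale moves the operands out of the denormal range before the reciprocal, and div_fixup patches up the special cases at the end.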
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); + if (VT == MVT::i1) { + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; + assert(VT.isVector() && + Store->getValue().getValueType().getScalarType() == MVT::i32); - if (VT.isVector() && VT.getVectorNumElements() >= 8) + unsigned AS = Store->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AS, Store->getAlignment())) { + return expandUnalignedStore(Store, DAG); + } + + unsigned NumElements = VT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorStore(Store, DAG); + case 8: + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + return SDValue(); + case 16: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) return SplitVectorStore(Op, DAG); - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + if (NumElements == 2) + return Op; - return SDValue(); + // If properly aligned, if we split we might be able to use ds_write_b64. + return SplitVectorStore(Op, DAG); + } + default: + llvm_unreachable("unhandled address space"); + } } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { } } +SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { + AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); + assert(AtomicNode->isCompareAndSwap()); + unsigned AS = AtomicNode->getAddressSpace(); + + // No custom lowering required for local address space + if (!isFlatGlobalAddrSpace(AS)) + return Op; + + // Non-local address space requires custom lowering for atomic compare + // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 + SDLoc DL(Op); + SDValue ChainIn = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + SDValue Old = Op.getOperand(2); + SDValue New = Op.getOperand(3); + EVT VT = Op.getValueType(); + MVT SimpleVT = VT.getSimpleVT(); + MVT VecType = MVT::getVectorVT(SimpleVT, 2); + + SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); + SDValue Ops[] = { ChainIn, Addr, NewOld }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), + Ops, VT, AtomicNode->getMemOperand()); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, } } - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. 
- // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector<SDValue, 4> Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector<SDValue, 4> Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - return SDValue(); } /// \brief Return true if the given offset Size in bytes can be folded into /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { + const SISubtarget &STI) { switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { // MUBUF instructions a 12-bit offset in bytes. @@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, case AMDGPUAS::CONSTANT_ADDRESS: { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return isUInt<20>(OffsetSize); else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); @@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
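The canFoldOffset rule quoted above, which the shl-pointer combine that follows uses to decide whether a folded constant still fits in the addressing mode, reduces to a simple range test on the byte offset. Restated as standalone arithmetic (a sketch; smrdOffsetFits is an illustrative name, not an LLVM helper, and isUInt<N>(x) just means x < 2^N):

#include <cstdint>

// Mirrors the CONSTANT_ADDRESS (SMRD) rule quoted above.
bool smrdOffsetFits(uint64_t OffsetBytes, bool IsVolcanicIslands) {
  if (IsVolcanicIslands)
    return OffsetBytes < (1ull << 20);      // VI: 20-bit byte offset
  return (OffsetBytes % 4 == 0) &&
         (OffsetBytes / 4) < (1ull << 8);   // SI: 8-bit dword offset
}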
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, if (DCI.isBeforeLegalize()) return SDValue(); + if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) + return Base; + SelectionDAG &DAG = DCI.DAG; // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + // TODO: This could be a generic combine with a predicate for extracting the + // high half of an integer being free. + + // (or i64:x, (zero_extend i32:y)) -> + // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) + if (LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LHS, RHS); + + if (RHS.getOpcode() == ISD::ZERO_EXTEND) { + SDValue ExtSrc = RHS.getOperand(0); + EVT SrcVT = ExtSrc.getValueType(); + if (SrcVT == MVT::i32) { + SDLoc SL(N); + SDValue LowLHS, HiBits; + std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); + SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); + + DCI.AddToWorklist(LowOr.getNode()); + DCI.AddToWorklist(HiBits.getNode()); + + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + LowOr, HiBits); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + } + } + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && RHS.getOpcode() == AMDGPUISD::FP_CLASS) { @@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } +// Constant fold canonicalize. +SDValue SITargetLowering::performFCanonicalizeCombine( + SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + const APFloat &C = CFP->getValueAPF(); + + // Flush denormals to 0 if not enabled. + if (C.isDenormal()) { + EVT VT = N->getValueType(0); + if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + + if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + } + + if (C.isNaN()) { + EVT VT = N->getValueType(0); + APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); + if (C.isSignaling()) { + // Quiet a signaling NaN. + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + // Make sure it is the canonical NaN bitpattern. + // + // TODO: Can we use -1 as the canonical NaN value since it's an inline + // immediate? 
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + return SDValue(CFP, 0); +} + static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { +static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) { + ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + if (Signed) { + if (K0->getAPIntValue().sge(K1->getAPIntValue())) + return SDValue(); + } else { + if (K0->getAPIntValue().uge(K1->getAPIntValue())) + return SDValue(); + } + + EVT VT = K0->getValueType(0); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); +} + +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) { + ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + // Ordered >= (although NaN inputs should have folded away by now). + APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); + if (Cmp == APFloat::cmpGreaterThan) + return SDValue(); + + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then + // give the other result, which is different from med3 with a NaN input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); + + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); +} + +SDValue SITargetLowering::performMinMaxCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; unsigned Opc = N->getOpcode(); @@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // Try commuted. 
+ // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } } - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == AMDGPUISD::FMIN_LEGACY && + Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; } return SDValue(); @@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: { + case ISD::UMIN: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } @@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: { unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - SDValue Src = N->getOperand(0); + + // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (Src.getOpcode() == ISD::SRL) { + // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x + // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { + unsigned SrcOffset = C->getZExtValue() + 8 * Offset; + if (SrcOffset < 32 && SrcOffset % 8 == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, + MVT::f32, Src.getOperand(0)); + } + } + } + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); APInt KnownZero, KnownOne; @@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + case ISD::ATOMIC_LOAD_UMAX: + case AMDGPUISD::ATOMIC_INC: + case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; @@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performOrCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); + case ISD::FCANONICALIZE: + return performFCanonicalizeCombine(N, DCI); + case AMDGPUISD::FRACT: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::LDEXP: { + SDValue Src = N->getOperand(0); + if (Src.isUndef()) + return Src; + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { if (TII->isInlineConstant(Node->getAPIntValue())) @@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { SDNode *Users[4] = { }; unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; // Try to figure out the used register components @@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; + Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy @@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Node->getMachineOpcode())) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + !TII->isGather4(Opcode)) adjustWritemask(Node, DAG); - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || + Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } @@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// \brief Assign the register class depending on the number of /// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (TII->isVOP3(MI->getOpcode())) { + if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); return; } - if (TII->isMIMG(*MI)) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); + if (TII->isMIMG(MI)) { + unsigned VReg = MI.getOperand(0).getReg(); + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; + unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 1 : 0; @@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); + unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); + MI.setDesc(TII->get(NewOpcode)); MRI.setRegClass(VReg, RC); return; } // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); + MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + return; } + // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg + // instruction, because the return type of these instructions is a vec2 of + // the memory type, so it can be tied to the input operand. + // This means these instructions always have a use, so we need to add a + // special case to check if the atomic has only one extract_subreg use, + // which itself has no uses. + if ((Node->hasNUsesOfValue(1, 0) && + Node->use_begin()->isMachineOpcode() && + Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && + !Node->use_begin()->hasAnyUseOfValue(0))) { + unsigned Def = MI.getOperand(0).getReg(); + + // Change this into a noret atomic. 
+ MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + + // If we only remove the def operand from the atomic instruction, the + // extract_subreg will be left with a use of a vreg without a def. + // So we need to insert an implicit_def to avoid machine verifier + // errors. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Def); + } return; } } -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { +static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, + uint64_t Val) { SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); } MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, + const SDLoc &DL, SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); // Build the half of the subregister with the constants before building the // full 128-bit register. If we are building multiple resource descriptors, @@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, + SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const { SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); |
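A closing note on the smed3/umed3 combines introduced earlier in this change: rewriting min(max(x, K0), K1) into med3(x, K0, K1) is only an identity when K0 <= K1, which is exactly the ordering performIntMed3ImmCombine checks before forming the node. A reference sketch of that identity (med3 and clampViaMinMax are illustrative helpers, not the hardware instruction or an LLVM API):

#include <algorithm>
#include <cassert>

// Median of three, written the classic min/max way.
int med3(int a, int b, int c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

// Clamp x into [K0, K1]; equal to med3(x, K0, K1) whenever K0 <= K1.
int clampViaMinMax(int x, int K0, int K1) {
  assert(K0 <= K1);
  return std::min(std::max(x, K0), K1);
}

The floating-point variant additionally needs the isKnownNeverSNan guard because, as the comment in performFPMed3ImmCombine notes, IEEE min/max quiet a signaling NaN while med3 with a NaN input would give a different result.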