diff options
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 601 |
1 files changed, 384 insertions, 217 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index aaf448346b53..340f4ac6f57a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -12,48 +12,21 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "AMDGPUArgumentUsageInfo.h" -#include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPUInstrInfo.h" -#include "AMDGPUPerfHintAnalysis.h" -#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIDefines.h" -#include "SIISelLowering.h" -#include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/ISDOpcodes.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/SelectionDAGNodes.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/InitializePasses.h" + #ifdef EXPENSIVE_CHECKS +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #endif -#include "llvm/IR/Instruction.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachineValueType.h" -#include "llvm/Support/MathExtras.h" -#include <cassert> -#include <cstdint> -#include <new> -#include <vector> #define DEBUG_TYPE "isel" @@ -191,6 +164,9 @@ private: bool isUniformLoad(const SDNode *N) const; bool isUniformBr(const SDNode *N) const; + bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS, + SDValue &RHS) const; + MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const; SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const; @@ -200,11 +176,16 @@ private: const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); - bool isDSOffsetLegal(SDValue Base, unsigned Offset, - unsigned OffsetBits) const; + bool isDSOffsetLegal(SDValue Base, unsigned Offset) const; + bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1, + unsigned Size) const; bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; + bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1) const; + bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0, + SDValue &Offset1, unsigned Size) const; bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, @@ -233,11 +214,11 @@ private: template <bool IsSigned> bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; - bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; - bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, SDValue &SLC) const; + SDValue &Offset) const; + bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset) const; + bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &Offset) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -252,11 +233,15 @@ private: bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const; - bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const; + bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods, + bool AllowAbs = true) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; @@ -519,8 +504,8 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { return true; // TODO: Move into isKnownNeverNaN - if (N->getFlags().isDefined()) - return N->getFlags().hasNoNaNs(); + if (N->getFlags().hasNoNaNs()) + return true; return CurDAG->isKnownNeverNaN(N); } @@ -557,8 +542,8 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const { if (!N->isMachineOpcode()) { if (N->getOpcode() == ISD::CopyToReg) { - unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); - if (Register::isVirtualRegister(Reg)) { + Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg(); + if (Reg.isVirtual()) { MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo(); return MRI.getRegClass(Reg); } @@ -716,8 +701,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX || - Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) { + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; @@ -920,6 +904,53 @@ bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { Term->getMetadata("structurizecfg.uniform"); } +static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, + SDValue &N0, SDValue &N1) { + if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST && + Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e. + // (i64 (bitcast (v2i32 (build_vector + // (or (extract_vector_elt V, 0), OFFSET), + // (extract_vector_elt V, 1))))) + SDValue Lo = Addr.getOperand(0).getOperand(0); + if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) { + SDValue BaseLo = Lo.getOperand(0); + SDValue BaseHi = Addr.getOperand(0).getOperand(1); + // Check that split base (Lo and Hi) are extracted from the same one. + if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + BaseLo.getOperand(0) == BaseHi.getOperand(0) && + // Lo is statically extracted from index 0. + isa<ConstantSDNode>(BaseLo.getOperand(1)) && + BaseLo.getConstantOperandVal(1) == 0 && + // Hi is statically extracted from index 0. + isa<ConstantSDNode>(BaseHi.getOperand(1)) && + BaseHi.getConstantOperandVal(1) == 1) { + N0 = BaseLo.getOperand(0).getOperand(0); + N1 = Lo.getOperand(1); + return true; + } + } + } + return false; +} + +bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS, + SDValue &RHS) const { + if (CurDAG->isBaseWithConstantOffset(Addr)) { + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); + return true; + } + + if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) { + assert(LHS && RHS && isa<ConstantSDNode>(RHS)); + return true; + } + + return false; +} + StringRef AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } @@ -994,7 +1025,7 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { static const unsigned OpcMap[2][2][2] = { {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32}, - {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}}, + {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}}, {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32}, {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}}; @@ -1073,7 +1104,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { } if (IsVALU) { - unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64; CurDAG->SelectNodeTo( N, Opc, N->getVTList(), @@ -1099,7 +1130,7 @@ void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) { Ops[8] = N->getOperand(0); Ops[9] = N->getOperand(4); - CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32, N->getVTList(), Ops); + CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops); } void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) { @@ -1124,9 +1155,14 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { assert(VT == MVT::f32 || VT == MVT::f64); unsigned Opc - = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; + = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64; - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod + SDValue Ops[8]; + SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); + SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]); + SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]); CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } @@ -1135,7 +1171,7 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; - unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32; + unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), @@ -1143,13 +1179,11 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); } -bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset, - unsigned OffsetBits) const { - if ((OffsetBits == 16 && !isUInt<16>(Offset)) || - (OffsetBits == 8 && !isUInt<8>(Offset))) +bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const { + if (!isUInt<16>(Offset)) return false; - if (Subtarget->hasUsableDSOffset() || + if (!Base || Subtarget->hasUsableDSOffset() || Subtarget->unsafeDSOffsetFoldingEnabled()) return true; @@ -1165,7 +1199,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { + if (isDSOffsetLegal(N0, C1->getSExtValue())) { // (add n0, c0) Base = N0; Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); @@ -1175,7 +1209,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // sub C, x -> add (sub 0, x), C if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { int64_t ByteOffset = C->getSExtValue(); - if (isUInt<16>(ByteOffset)) { + if (isDSOffsetLegal(SDValue(), ByteOffset)) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); // XXX - This is kind of hacky. Create a dummy sub node so we can check @@ -1184,13 +1218,13 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1)); - if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + if (isDSOffsetLegal(Sub, ByteOffset)) { SmallVector<SDValue, 3> Opnds; Opnds.push_back(Zero); Opnds.push_back(Addr.getOperand(1)); // FIXME: Select to VOP3 version for with-carry. - unsigned SubOp = AMDGPU::V_SUB_I32_e32; + unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; if (Subtarget->hasAddNoCarry()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( @@ -1214,7 +1248,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, SDLoc DL(Addr); - if (isUInt<16>(CAddr->getZExtValue())) { + if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); @@ -1230,75 +1264,104 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, return true; } +bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0, + unsigned Offset1, + unsigned Size) const { + if (Offset0 % Size != 0 || Offset1 % Size != 0) + return false; + if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size)) + return false; + + if (!Base || Subtarget->hasUsableDSOffset() || + Subtarget->unsafeDSOffsetFoldingEnabled()) + return true; + + // On Southern Islands instruction with a negative base value and an offset + // don't seem to work. + return CurDAG->SignBitIsZero(Base); +} + // TODO: If offset is too big, put low 16-bit into offset. bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const { + return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4); +} + +bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base, + SDValue &Offset0, + SDValue &Offset1) const { + return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8); +} + +bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, + SDValue &Offset0, SDValue &Offset1, + unsigned Size) const { SDLoc DL(Addr); if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - unsigned DWordOffset0 = C1->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; + unsigned OffsetValue0 = C1->getZExtValue(); + unsigned OffsetValue1 = OffsetValue0 + Size; + // (add n0, c0) - if (isDSOffsetLegal(N0, DWordOffset1, 8)) { + if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) { Base = N0; - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8); return true; } } else if (Addr.getOpcode() == ISD::SUB) { // sub C, x -> add (sub 0, x), C - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { - unsigned DWordOffset0 = C->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; + if (const ConstantSDNode *C = + dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + unsigned OffsetValue0 = C->getZExtValue(); + unsigned OffsetValue1 = OffsetValue0 + Size; - if (isUInt<8>(DWordOffset0)) { + if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) { SDLoc DL(Addr); SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); // XXX - This is kind of hacky. Create a dummy sub node so we can check // the known bits in isDSOffsetLegal. We need to emit the selected node // here, so this is thrown away. - SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, - Zero, Addr.getOperand(1)); + SDValue Sub = + CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1)); - if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) { SmallVector<SDValue, 3> Opnds; Opnds.push_back(Zero); Opnds.push_back(Addr.getOperand(1)); - unsigned SubOp = AMDGPU::V_SUB_I32_e32; + unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32; if (Subtarget->hasAddNoCarry()) { SubOp = AMDGPU::V_SUB_U32_e64; Opnds.push_back( CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit } - MachineSDNode *MachineSub - = CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds); + MachineSDNode *MachineSub = CurDAG->getMachineNode( + SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds); Base = SDValue(MachineSub, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8); return true; } } } } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { - unsigned DWordOffset0 = CAddr->getZExtValue() / 4; - unsigned DWordOffset1 = DWordOffset0 + 1; - assert(4 * DWordOffset0 == CAddr->getZExtValue()); + unsigned OffsetValue0 = CAddr->getZExtValue(); + unsigned OffsetValue1 = OffsetValue0 + Size; - if (isUInt<8>(DWordOffset0) && isUInt<8>(DWordOffset1)) { + if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); - MachineSDNode *MovZero - = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, - DL, MVT::i32, Zero); + MachineSDNode *MovZero = + CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); Base = SDValue(MovZero, 0); - Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); - Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8); return true; } } @@ -1454,22 +1517,16 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { SDLoc DL(N); - const MachineFunction &MF = CurDAG->getMachineFunction(); - const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - - if (auto FI = dyn_cast<FrameIndexSDNode>(N)) { - SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), - FI->getValueType(0)); - // If we can resolve this to a frame index access, this will be relative to - // either the stack or frame pointer SGPR. - return std::make_pair( - TFI, CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)); - } + auto *FI = dyn_cast<FrameIndexSDNode>(N); + SDValue TFI = + FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N; - // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset. - return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32)); + // We rebase the base address into an absolute stack address and hence + // use constant 0 for soffset. This value must be retained until + // frame elimination and eliminateFrameIndex will choose the appropriate + // frame register if need be. + return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32)); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, @@ -1628,155 +1685,245 @@ static MemSDNode* findMemSDNode(SDNode *N) { llvm_unreachable("cannot find MemSDNode in the pattern!"); } -static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr, - SDValue &N0, SDValue &N1) { - if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST && - Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { - // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e. - // (i64 (bitcast (v2i32 (build_vector - // (or (extract_vector_elt V, 0), OFFSET), - // (extract_vector_elt V, 1))))) - SDValue Lo = Addr.getOperand(0).getOperand(0); - if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) { - SDValue BaseLo = Lo.getOperand(0); - SDValue BaseHi = Addr.getOperand(0).getOperand(1); - // Check that split base (Lo and Hi) are extracted from the same one. - if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - BaseLo.getOperand(0) == BaseHi.getOperand(0) && - // Lo is statically extracted from index 0. - isa<ConstantSDNode>(BaseLo.getOperand(1)) && - BaseLo.getConstantOperandVal(1) == 0 && - // Hi is statically extracted from index 0. - isa<ConstantSDNode>(BaseHi.getOperand(1)) && - BaseHi.getConstantOperandVal(1) == 1) { - N0 = BaseLo.getOperand(0).getOperand(0); - N1 = Lo.getOperand(1); - return true; - } - } - } - return false; -} - template <bool IsSigned> bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { + SDValue &Offset) const { int64_t OffsetVal = 0; + unsigned AS = findMemSDNode(N)->getAddressSpace(); + if (Subtarget->hasFlatInstOffsets() && (!Subtarget->hasFlatSegmentOffsetBug() || - findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) { + AS != AMDGPUAS::FLAT_ADDRESS)) { SDValue N0, N1; - if (CurDAG->isBaseWithConstantOffset(Addr)) { - N0 = Addr.getOperand(0); - N1 = Addr.getOperand(1); - } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) { - assert(N0 && N1 && isa<ConstantSDNode>(N1)); - } - if (N0 && N1) { + if (isBaseWithConstantOffset64(Addr, N0, N1)) { uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - unsigned AS = findMemSDNode(N)->getAddressSpace(); if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { Addr = N0; OffsetVal = COffsetVal; } else { // If the offset doesn't fit, put the low bits into the offset field and // add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. SDLoc DL(N); - uint64_t ImmField; - const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned); - if (IsSigned) { - ImmField = SignExtend64(COffsetVal, NumBits); - - // Don't use a negative offset field if the base offset is positive. - // Since the scheduler currently relies on the offset field, doing so - // could result in strange scheduling decisions. - - // TODO: Should we not do this in the opposite direction as well? - if (static_cast<int64_t>(COffsetVal) > 0) { - if (static_cast<int64_t>(ImmField) < 0) { - const uint64_t OffsetMask = - maskTrailingOnes<uint64_t>(NumBits - 1); - ImmField = COffsetVal & OffsetMask; - } - } - } else { - // TODO: Should we do this for a negative offset? - const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits); - ImmField = COffsetVal & OffsetMask; - } + uint64_t RemainderOffset; - uint64_t RemainderOffset = COffsetVal - ImmField; + std::tie(OffsetVal, RemainderOffset) + = TII->splitFlatOffset(COffsetVal, AS, IsSigned); - assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned)); - assert(RemainderOffset + ImmField == COffsetVal); - - OffsetVal = ImmField; + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base address + // is uniform and saddr is usable? + SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, - MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, - MVT::i32, N0, Sub1); + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } VAddr = Addr; Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16); - SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); return true; } -bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { - return SelectFlatOffset<false>(N, Addr, VAddr, Offset, SLC); +// If this matches zero_extend i32:x, return x +static SDValue matchZExtFromI32(SDValue Op) { + if (Op.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + + SDValue ExtSrc = Op.getOperand(0); + return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue(); +} + +// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, + SDValue Addr, + SDValue &SAddr, + SDValue &VOffset, + SDValue &Offset) const { + int64_t ImmOffset = 0; + + // Match the immediate offset first, which canonically is moved as low as + // possible. + + SDValue LHS, RHS; + if (isBaseWithConstantOffset64(Addr, LHS, RHS)) { + int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + + if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) { + Addr = LHS; + ImmOffset = COffsetVal; + } else if (!LHS->isDivergent() && COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) + = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + SAddr = LHS; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } + } + } + + // Match the variable offset. + if (Addr.getOpcode() != ISD::ADD) { + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || + isa<ConstantSDNode>(Addr)) + return false; + + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. + SAddr = Addr; + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; + } + + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); + + if (!LHS->isDivergent()) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + SAddr = LHS; + VOffset = ZextRHS; + } + } + + if (!SAddr && !RHS->isDivergent()) { + // add (zero_extend (i32 vgpr)), (i64 sgpr) + if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + SAddr = RHS; + VOffset = ZextLHS; + } + } + + if (!SAddr) + return false; + + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; } -bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset, - SDValue &SLC) const { - return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC); +// Match (32-bit SGPR base) + sext(imm offset) +bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N, + SDValue Addr, + SDValue &SAddr, + SDValue &Offset) const { + if (Addr->isDivergent()) + return false; + + SAddr = Addr; + int64_t COffsetVal = 0; + + if (CurDAG->isBaseWithConstantOffset(Addr)) { + COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); + SAddr = Addr.getOperand(0); + } + + if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) { + SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); + } else if (SAddr.getOpcode() == ISD::ADD && + isa<FrameIndexSDNode>(SAddr.getOperand(0))) { + // Materialize this into a scalar move for scalar address to avoid + // readfirstlane. + auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); + SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), + FI->getValueType(0)); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr), + MVT::i32, TFI, SAddr.getOperand(1)), + 0); + } + + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + + if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { + int64_t RemainderOffset = COffsetVal; + int64_t ImmField = 0; + const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true); + // Use signed division by a power of two to truncate towards 0. + int64_t D = 1LL << (NumBits - 1); + RemainderOffset = (COffsetVal / D) * D; + ImmField = COffsetVal - RemainderOffset; + + assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true)); + assert(RemainderOffset + ImmField == COffsetVal); + + COffsetVal = ImmField; + + SDLoc DL(N); + SDValue AddOffset = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32, + SAddr, AddOffset), 0); + } + + Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16); + + return true; } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, @@ -2223,11 +2370,12 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; SDValue CmpVal = Mem->getOperand(2); + SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); // XXX - Do we care about glue operands? SDValue Ops[] = { - CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() + CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() }; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); @@ -2241,8 +2389,9 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; SDValue CmpVal = Mem->getOperand(2); + SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); SDValue Ops[] = { - CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() + CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() }; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); @@ -2284,7 +2433,7 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) { SDValue PtrOffset = Ptr.getOperand(1); const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue(); - if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) { + if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) { N = glueCopyToM0(N, PtrBase); Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32); } @@ -2379,15 +2528,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) { SDValue Chain = N->getOperand(0); SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32); - // TODO: Can this just be removed from the instruction? - SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1); - const unsigned Opc = gwsIntrinToOpcode(IntrID); SmallVector<SDValue, 5> Ops; if (HasVSrc) Ops.push_back(N->getOperand(2)); Ops.push_back(OffsetField); - Ops.push_back(GDS); Ops.push_back(Chain); SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); @@ -2511,7 +2656,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) { } bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, - unsigned &Mods) const { + unsigned &Mods, + bool AllowAbs) const { Mods = 0; Src = In; @@ -2520,7 +2666,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src, Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::FABS) { + if (AllowAbs && Src.getOpcode() == ISD::FABS) { Mods |= SISrcMods::ABS; Src = Src.getOperand(0); } @@ -2539,6 +2685,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, return false; } +bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + unsigned Mods; + if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) { + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + + return false; +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const { SelectVOP3Mods(In, Src, SrcMods); @@ -2563,6 +2720,16 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + SDLoc DL(In); + Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + Omod = CurDAG->getTargetConstant(0, DL, MVT::i1); + + return SelectVOP3BMods(In, Src, SrcMods); +} + bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const { Src = In; |