path: root/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 519
1 file changed, 325 insertions(+), 194 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 340f4ac6f57a..a3106ded1e38 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -107,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool EnableLateStructurizeCFG;
+ // Returns true if the given opcode is lowered with a final instruction
+ // that zeros the high result bits.
+ bool fp16SrcZerosHighBits(unsigned Opc) const;
+
public:
explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
@@ -188,15 +192,9 @@ private:
SDValue &Offset1, unsigned Size) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
+ SDValue &Idxen, SDValue &Addr64) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
- SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const;
- bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
- SDValue &SLC) const;
+ SDValue &SOffset, SDValue &Offset) const;
bool SelectMUBUFScratchOffen(SDNode *Parent,
SDValue Addr, SDValue &RSrc, SDValue &VAddr,
SDValue &SOffset, SDValue &ImmOffset) const;
@@ -204,17 +202,17 @@ private:
SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
- bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
- SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
- template <bool IsSigned>
+ bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, uint64_t FlatVariant) const;
bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
+ bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
+ bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
@@ -322,6 +320,16 @@ static SDValue stripBitcast(SDValue Val) {
// Figure out if this is really an extract of the high 16 bits of a dword.
static bool isExtractHiElt(SDValue In, SDValue &Out) {
In = stripBitcast(In);
+
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (!Idx->isOne())
+ return false;
+ Out = In.getOperand(0);
+ return true;
+ }
+ }
+
if (In.getOpcode() != ISD::TRUNCATE)
return false;
@@ -341,6 +349,13 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
// Look through operations that obscure what is really just a use of the low
// 16 bits of the same register.
static SDValue stripExtractLoElt(SDValue In) {
+ if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
+ if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
+ return In.getOperand(0);
+ }
+ }
+
if (In.getOpcode() == ISD::TRUNCATE) {
SDValue Src = In.getOperand(0);
if (Src.getValueType().getSizeInBits() == 32)
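
The new EXTRACT_VECTOR_ELT cases let both helpers see through vector extracts
in addition to the shift/truncate forms. Assuming the little-endian lane order
AMDGPU uses, extracting element 1 of a v2i16 reads the same bits as truncating
a 16-bit right shift of the containing dword; a minimal standalone sketch with
plain integers (not SelectionDAG nodes):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint32_t Dword = 0xABCD1234u;

      // Shift form the helper already matched: (trunc (srl x, 16)).
      uint16_t ViaShift = uint16_t(Dword >> 16);

      // Vector form it now also matches: (extract_vector_elt v2i16:x, 1).
      uint16_t Lanes[2];
      std::memcpy(Lanes, &Dword, sizeof(Lanes));
      uint16_t ViaExtract = Lanes[1];

      std::printf("0x%04X 0x%04X\n", ViaShift, ViaExtract); // both 0xABCD
    }
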
@@ -391,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return SelectionDAGISel::runOnMachineFunction(MF);
}
+bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
+ // XXX - only need to list legal operations.
+ switch (Opc) {
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FCANONICALIZE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::FABS:
+ // Fabs is lowered to a bit operation, but it is an AND, which clears the
+ // high bits anyway.
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case AMDGPUISD::FRACT:
+ case AMDGPUISD::CLAMP:
+ case AMDGPUISD::COS_HW:
+ case AMDGPUISD::SIN_HW:
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMED3:
+ case AMDGPUISD::FMAD_FTZ:
+ case AMDGPUISD::RCP:
+ case AMDGPUISD::RSQ:
+ case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LDEXP:
+ // On gfx10, all 16-bit instructions preserve the high bits.
+ return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
+ case ISD::FP_ROUND:
+ // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
+ // high bits on gfx9.
+ // TODO: If we had the source node, we could check whether the source was
+ // fma/mad.
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ case ISD::FMA:
+ case ISD::FMAD:
+ case AMDGPUISD::DIV_FIXUP:
+ return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
+ default:
+ // fcopysign, select and others may be lowered to 32-bit bit operations
+ // which don't zero the high bits.
+ return false;
+ }
+}
+
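
The generation checks above encode a hardware difference in how 16-bit VALU
results land in a 32-bit VGPR. A toy illustration of what the predicate is
tracking (the bit patterns are made up; no real instruction is modeled):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // A VGPR with garbage in bits 16-31 and an f16 value in bits 0-15.
      uint32_t Vgpr = 0xDEAD0000u | 0x3C00u; // 0x3C00 = 1.0 in IEEE half
      uint32_t ResultLo = 0x4000u;           // 2.0h, a new 16-bit result

      // gfx9 and earlier: the 16-bit result zeros bits 16-31, so a later
      // (i32 zero_extend (i16 ...)) of it needs no explicit mask.
      uint32_t Gfx9Dest = ResultLo;

      // gfx10: the high half of the destination is preserved instead, so
      // fp16SrcZerosHighBits returns false and the mask must stay.
      uint32_t Gfx10Dest = (Vgpr & 0xFFFF0000u) | ResultLo;

      std::printf("gfx9: 0x%08X  gfx10: 0x%08X\n", Gfx9Dest, Gfx10Dest);
    }
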
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
assert(Subtarget->d16PreservesUnusedBits());
MVT VT = N->getValueType(0).getSimpleVT();
@@ -1374,13 +1451,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &Offen,
- SDValue &Idxen, SDValue &Addr64,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &Offen, SDValue &Idxen,
+ SDValue &Addr64) const {
// Subtarget prefers to use flat instructions.
// FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
@@ -1388,14 +1462,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDLoc DL(Addr);
- if (!GLC.getNode())
- GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- if (!SLC.getNode())
- SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
- DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
-
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1472,9 +1538,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE,
- SDValue &DLC, SDValue &SWZ) const {
+ SDValue &Offset) const {
SDValue Ptr, Offen, Idxen, Addr64;
// The addr64 bit was removed for Volcanic Islands.
@@ -1482,8 +1546,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (!Subtarget->hasAddr64())
return false;
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1500,21 +1563,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
- SDValue &VAddr, SDValue &SOffset,
- SDValue &Offset,
- SDValue &SLC) const {
- SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
- auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
- return PSV && PSV->isStack();
-}
-
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
@@ -1551,13 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo
- = cast<MemSDNode>(Parent)->getPointerInfo();
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
return true;
}
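
For context, the surrounding logic splits an offset that does not fit the
12-bit MUBUF immediate: the low 12 bits stay in the encoding and the rest is
materialized into VAddr with v_mov_b32. The arithmetic, as a standalone
sketch:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Off = 0x11234;           // too large for the 12-bit field
      uint32_t ImmField = Off & 4095;   // 0x234, stays in the instruction
      uint32_t HighBits = Off & ~4095u; // 0x11000, moved into VAddr
      std::printf("imm=0x%X vaddr=0x%X sum=0x%X\n", ImmField, HighBits,
                  ImmField + HighBits); // sum reproduces the original offset
    }
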
@@ -1600,44 +1642,65 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
return true;
}
+static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
+ if (Val.getOpcode() != ISD::CopyFromReg)
+ return false;
+ auto RC =
+ TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
+ return RC && TRI.isSGPRClass(RC);
+}
+
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
- ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
- if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
- return false;
-
- SDLoc DL(Addr);
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SDLoc DL(Addr);
- SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ // CopyFromReg <sgpr>
+ if (IsCopyFromSGPR(*TRI, Addr)) {
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = Addr;
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ return true;
+ }
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
+ ConstantSDNode *CAddr;
+ if (Addr.getOpcode() == ISD::ADD) {
+ // Add (CopyFromReg <sgpr>) <constant>
+ CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
+ return false;
+ if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
+ return false;
- // FIXME: Get from MachinePointerInfo? We should only be using the frame
- // offset if we know this is in a call sequence.
- SOffset = isStackPtrRelative(PtrInfo)
- ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
- : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = Addr.getOperand(0);
+ } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
+ SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
+ // <constant>
+ SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ } else {
+ return false;
+ }
+
+ SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
}
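
The rewritten matcher accepts three address shapes rather than only a bare
constant. A toy model of the dispatch, assuming the 12-bit unsigned MUBUF
immediate range (ToyAddr and isLegalMUBUFImm are invented stand-ins for the
SDNode inspection and SIInstrInfo::isLegalMUBUFImmOffset):

    #include <cstdint>
    #include <cstdio>

    // Shapes accepted, and what they select to:
    //   1. CopyFromReg <sgpr>            -> SOffset = sgpr, imm = 0
    //   2. add (CopyFromReg <sgpr>), <c> -> SOffset = sgpr, imm = c
    //   3. <c>                           -> SOffset = 0,    imm = c
    struct ToyAddr {
      bool IsSGPRCopy;  // bare CopyFromReg of an SGPR
      bool IsAddOfSGPR; // add(CopyFromReg <sgpr>, constant)
      int64_t Imm;      // the constant part, if any
    };

    static bool isLegalMUBUFImm(int64_t V) { return V >= 0 && V < 4096; }

    static bool matchScratchOffset(const ToyAddr &A, bool &SOffsetIsSGPR,
                                   int64_t &Imm) {
      if (A.IsSGPRCopy) {                             // form 1
        SOffsetIsSGPR = true;
        Imm = 0;
        return true;
      }
      if (A.IsAddOfSGPR && isLegalMUBUFImm(A.Imm)) {  // form 2
        SOffsetIsSGPR = true;
        Imm = A.Imm;
        return true;
      }
      if (!A.IsAddOfSGPR && isLegalMUBUFImm(A.Imm)) { // form 3
        SOffsetIsSGPR = false;
        Imm = A.Imm;
        return true;
      }
      return false;
    }

    int main() {
      bool SOffsetIsSGPR;
      int64_t Imm;
      if (matchScratchOffset({false, true, 72}, SOffsetIsSGPR, Imm))
        std::printf("soffset=%s imm=%lld\n", SOffsetIsSGPR ? "sgpr" : "0",
                    (long long)Imm);
    }
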
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &SOffset, SDValue &Offset,
- SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC,
- SDValue &SWZ) const {
+ SDValue &SOffset, SDValue &Offset) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
- if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC, SWZ))
+ if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1656,21 +1719,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return false;
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset
- ) const {
- SDValue GLC, SLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &Soffset, SDValue &Offset,
- SDValue &SLC) const {
- SDValue GLC, TFE, DLC, SWZ;
-
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
-}
-
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
@@ -1685,24 +1733,25 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
-template <bool IsSigned>
-bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset) const {
+bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
+ SDValue &VAddr, SDValue &Offset,
+ uint64_t FlatVariant) const {
int64_t OffsetVal = 0;
unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (Subtarget->hasFlatInstOffsets() &&
- (!Subtarget->hasFlatSegmentOffsetBug() ||
- AS != AMDGPUAS::FLAT_ADDRESS)) {
+ bool CanHaveFlatSegmentOffsetBug =
+ Subtarget->hasFlatSegmentOffsetBug() &&
+ FlatVariant == SIInstrFlags::FLAT &&
+ (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
+
+ if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1)) {
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+ int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
Addr = N0;
OffsetVal = COffsetVal;
} else {
@@ -1719,8 +1768,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDLoc DL(N);
uint64_t RemainderOffset;
- std::tie(OffsetVal, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AS, IsSigned);
+ std::tie(OffsetVal, RemainderOffset) =
+ TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
SDValue AddOffsetLo =
getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
@@ -1777,6 +1826,25 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
+}
+
+bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset) const {
+ return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
+ SIInstrFlags::FlatScratch);
+}
+
// If this matches zero_extend i32:x, return x
static SDValue matchZExtFromI32(SDValue Op) {
if (Op.getOpcode() != ISD::ZERO_EXTEND)
@@ -1802,126 +1870,144 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+ if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
- } else if (!LHS->isDivergent() && COffsetVal > 0) {
- SDLoc SL(N);
- // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
- // (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset)
- = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
-
- if (isUInt<32>(RemainderOffset)) {
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
- CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- SAddr = LHS;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
- return true;
+ } else if (!LHS->isDivergent()) {
+ if (COffsetVal > 0) {
+ SDLoc SL(N);
+ // saddr + large_offset -> saddr +
+ // (voffset = large_offset & ~MaxOffset) +
+ // (large_offset & MaxOffset);
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+
+ if (isUInt<32>(RemainderOffset)) {
+ SDNode *VMov = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
+ SAddr = LHS;
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ return true;
+ }
}
+
+ // We are adding a 64-bit SGPR and a constant. If the constant bus limit
+ // is 1, we would need 1 or 2 extra moves for each half of the constant,
+ // so it is better to do a scalar add and then issue a single VALU
+ // instruction to materialize zero. Otherwise it takes fewer instructions
+ // to perform VALU adds with immediates or inline literals.
+ unsigned NumLiterals =
+ !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
+ !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
+ if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
+ return false;
}
}
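
The NumLiterals count above measures how many 32-bit halves of the addend
would need a real literal rather than an inline constant. A simplified
standalone version (isInlineConstant32 only models the integer range -16..64;
the real TII->isInlineConstant also accepts a few float bit patterns):

    #include <cstdint>
    #include <cstdio>

    static bool isInlineConstant32(int32_t V) { return V >= -16 && V <= 64; }

    int main() {
      int64_t COffsetVal = 0x100001000ll; // high half 1 (inline),
                                          // low half 4096 (a literal)
      unsigned NumLiterals =
          !isInlineConstant32(int32_t(COffsetVal & 0xffffffff)) +
          !isInlineConstant32(int32_t(COffsetVal >> 32));
      // Constant bus limit 1 (e.g. gfx9): 1 > 1 is false, so the selector
      // keeps the whole add scalar and materializes a zero voffset below.
      // Limit 2+ (e.g. gfx10): 2 > 1, so it bails out here and lets the
      // add be done with VALU adds carrying the literal directly.
      std::printf("literal halves needed: %u\n", NumLiterals); // 1
    }
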
// Match the variable offset.
- if (Addr.getOpcode() != ISD::ADD) {
- if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
- isa<ConstantSDNode>(Addr))
- return false;
-
- // It's cheaper to materialize a single 32-bit zero for vaddr than the two
- // moves required to copy a 64-bit SGPR to VGPR.
- SAddr = Addr;
- SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
- CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
- VOffset = SDValue(VMov, 0);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
- return true;
- }
+ if (Addr.getOpcode() == ISD::ADD) {
+ LHS = Addr.getOperand(0);
+ RHS = Addr.getOperand(1);
- LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
+ if (!LHS->isDivergent()) {
+ // add (i64 sgpr), (zero_extend (i32 vgpr))
+ if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ SAddr = LHS;
+ VOffset = ZextRHS;
+ }
+ }
- if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
- SAddr = LHS;
- VOffset = ZextRHS;
+ if (!SAddr && !RHS->isDivergent()) {
+ // add (zero_extend (i32 vgpr)), (i64 sgpr)
+ if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ SAddr = RHS;
+ VOffset = ZextLHS;
+ }
}
- }
- if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
- SAddr = RHS;
- VOffset = ZextLHS;
+ if (SAddr) {
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ return true;
}
}
- if (!SAddr)
+ if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
+ isa<ConstantSDNode>(Addr))
return false;
+ // It's cheaper to materialize a single 32-bit zero for vaddr than the two
+ // moves required to copy a 64-bit SGPR to VGPR.
+ SAddr = Addr;
+ SDNode *VMov =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
+ CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
+ VOffset = SDValue(VMov, 0);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
return true;
}
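
Once matched, the saddr form computes a 64-bit scalar base plus an unsigned
32-bit VGPR offset plus the immediate. The zero_extend requirement enforced
by matchZExtFromI32 is what makes the unsigned treatment correct; a
standalone sketch (globalSAddr is an invented name for the effective-address
math):

    #include <cstdint>
    #include <cstdio>

    // Effective address of the matched pattern
    //   add (i64 sgpr), (zero_extend (i32 vgpr))   [+ imm]
    static uint64_t globalSAddr(uint64_t SAddr, uint32_t VOffset,
                                int64_t ImmOffset) {
      return SAddr + uint64_t(VOffset) + uint64_t(ImmOffset);
    }

    int main() {
      // 0x80000000 must act as +2^31; a sign-extending add could not be
      // selected into this form.
      std::printf("0x%llx\n", (unsigned long long)globalSAddr(
                                  0x100000000ull, 0x80000000u, 16));
      // prints 0x180000010
    }
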
+static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
+ if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+ SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ } else if (SAddr.getOpcode() == ISD::ADD &&
+ isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+ // Materialize this with a scalar add to keep the address scalar and
+ // avoid a readfirstlane.
+ auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
+ MVT::i32, TFI, SAddr.getOperand(1)),
+ 0);
+ }
+
+ return SAddr;
+}
+
// Match (32-bit SGPR base) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
- SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
SDValue &SAddr,
SDValue &Offset) const {
if (Addr->isDivergent())
return false;
- SAddr = Addr;
+ SDLoc DL(Addr);
+
int64_t COffsetVal = 0;
if (CurDAG->isBaseWithConstantOffset(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
+ } else {
+ SAddr = Addr;
}
- if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
- SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
- } else if (SAddr.getOpcode() == ISD::ADD &&
- isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
- // Materialize this into a scalar move for scalar address to avoid
- // readfirstlane.
- auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
- SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
- FI->getValueType(0));
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
- MVT::i32, TFI, SAddr.getOperand(1)),
- 0);
- }
+ SAddr = SelectSAddrFI(CurDAG, SAddr);
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
- int64_t RemainderOffset = COffsetVal;
- int64_t ImmField = 0;
- const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
- // Use signed division by a power of two to truncate towards 0.
- int64_t D = 1LL << (NumBits - 1);
- RemainderOffset = (COffsetVal / D) * D;
- ImmField = COffsetVal - RemainderOffset;
-
- assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
- assert(RemainderOffset + ImmField == COffsetVal);
+ if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)) {
+ int64_t SplitImmOffset, RemainderOffset;
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
- COffsetVal = ImmField;
+ COffsetVal = SplitImmOffset;
- SDLoc DL(N);
SDValue AddOffset =
- getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
- SAddr, AddOffset), 0);
+ SAddr.getOpcode() == ISD::TargetFrameIndex
+ ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
+ : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
+ SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
+ SAddr, AddOffset),
+ 0);
}
- Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
return true;
}
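
Both the removed open-coded split and its replacement, TII->splitFlatOffset,
rely on the same arithmetic fact: signed division by a power of two truncates
toward zero, so the part kept in the immediate field has the same sign as the
original offset and fits the signed encoding. A standalone model (splitOffset
and the 13-bit width are illustrative; the real width comes from
AMDGPU::getNumFlatOffsetBits):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <utility>

    // Split Off into (ImmField, Remainder): ImmField fits a signed
    // NumBits-wide field, Remainder goes into an explicit s_add_i32.
    static std::pair<int64_t, int64_t> splitOffset(int64_t Off,
                                                   unsigned NumBits) {
      int64_t D = int64_t(1) << (NumBits - 1);
      int64_t Remainder = (Off / D) * D; // truncates toward zero
      int64_t ImmField = Off - Remainder;
      assert(ImmField >= -D && ImmField < D);
      assert(Remainder + ImmField == Off);
      return {ImmField, Remainder};
    }

    int main() {
      for (int64_t Off : {int64_t(5000), int64_t(-5000)}) {
        auto [Imm, Rem] = splitOffset(Off, 13);
        std::printf("off=%lld -> imm=%lld + remainder=%lld\n",
                    (long long)Off, (long long)Imm, (long long)Rem);
      }
    }
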
@@ -2364,35 +2450,32 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
MachineSDNode *CmpSwap = nullptr;
if (Subtarget->hasAddr64()) {
- SDValue SRsrc, VAddr, SOffset, Offset, SLC;
+ SDValue SRsrc, VAddr, SOffset, Offset;
- if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
+ if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
// XXX - Do we care about glue operands?
- SDValue Ops[] = {
- CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
+ Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
}
if (!CmpSwap) {
- SDValue SRsrc, SOffset, Offset, SLC;
- if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
+ SDValue SRsrc, SOffset, Offset;
+ if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
SDValue CmpVal = Mem->getOperand(2);
- SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
- SDValue Ops[] = {
- CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
- };
+ SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
+ SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
}
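
The separate i1 GLC/SLC/TFE/DLC/SWZ operands collapse into a single i32
cache-policy operand throughout this patch. A sketch of that bitfield style
(the bit values are illustrative; the real constants live in the AMDGPU::CPol
enum):

    #include <cstdint>
    #include <cstdio>

    namespace CPol {
    enum : uint32_t { GLC = 1u << 0, SLC = 1u << 1, DLC = 1u << 2 };
    } // namespace CPol

    int main() {
      // Atomics that return the old value must set GLC, which is what
      // getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32) encodes above.
      uint32_t Policy = CPol::GLC;
      std::printf("glc=%d slc=%d dlc=%d\n", (Policy & CPol::GLC) != 0,
                  (Policy & CPol::SLC) != 0, (Policy & CPol::DLC) != 0);
    }
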
@@ -2623,7 +2706,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
Opcode = AMDGPU::SOFT_WQM;
break;
case Intrinsic::amdgcn_wwm:
- Opcode = AMDGPU::WWM;
+ case Intrinsic::amdgcn_strict_wwm:
+ Opcode = AMDGPU::STRICT_WWM;
+ break;
+ case Intrinsic::amdgcn_strict_wqm:
+ Opcode = AMDGPU::STRICT_WQM;
break;
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
@@ -2773,18 +2860,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (isExtractHiElt(Hi, Hi))
Mods |= SISrcMods::OP_SEL_1;
+ unsigned VecSize = Src.getValueSizeInBits();
Lo = stripExtractLoElt(Lo);
Hi = stripExtractLoElt(Hi);
+ if (Lo.getValueSizeInBits() > VecSize) {
+ Lo = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Lo);
+ }
+
+ if (Hi.getValueSizeInBits() > VecSize) {
+ Hi = CurDAG->getTargetExtractSubreg(
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Hi);
+ }
+
+ assert(Lo.getValueSizeInBits() <= VecSize &&
+ Hi.getValueSizeInBits() <= VecSize);
+
if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
// Really a scalar input. Just select from the low half of the register to
// avoid packing.
- Src = Lo;
+ if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
+ Src = Lo;
+ } else {
+ assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
+
+ SDLoc SL(In);
+ SDValue Undef = SDValue(
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
+ Lo.getValueType()), 0);
+ auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
+ : AMDGPU::SReg_64RegClassID;
+ const SDValue Ops[] = {
+ CurDAG->getTargetConstant(RC, SL, MVT::i32),
+ Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
+
+ Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
+ Src.getValueType(), Ops), 0);
+ }
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
+ if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
+ uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+ .bitcastToAPInt().getZExtValue();
+ if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
+ Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+ }
+
Mods = VecMods;
}