diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 519 |
1 files changed, 325 insertions, 194 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 340f4ac6f57a..a3106ded1e38 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -107,6 +107,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool EnableLateStructurizeCFG; + // Instructions that will be lowered with a final instruction that zeros the + // high result bits. + bool fp16SrcZerosHighBits(unsigned Opc) const; + public: explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr, CodeGenOpt::Level OptLevel = CodeGenOpt::Default) @@ -188,15 +192,9 @@ private: SDValue &Offset1, unsigned Size) const; bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; + SDValue &Idxen, SDValue &Addr64) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, SDValue &Offset, - SDValue &SLC) const; + SDValue &SOffset, SDValue &Offset) const; bool SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr, SDValue &RSrc, SDValue &VAddr, SDValue &SOffset, SDValue &ImmOffset) const; @@ -204,17 +202,17 @@ private: SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - template <bool IsSigned> + bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset, uint64_t FlatVariant) const; bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr, SDValue &Offset) const; + bool SelectGlobalOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset) const; + bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr, + SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, @@ -322,6 +320,16 @@ static SDValue stripBitcast(SDValue Val) { // Figure out if this is really an extract of the high 16-bits of a dword. static bool isExtractHiElt(SDValue In, SDValue &Out) { In = stripBitcast(In); + + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (!Idx->isOne()) + return false; + Out = In.getOperand(0); + return true; + } + } + if (In.getOpcode() != ISD::TRUNCATE) return false; @@ -341,6 +349,13 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) { // Look through operations that obscure just looking at the low 16-bits of the // same register. static SDValue stripExtractLoElt(SDValue In) { + if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) { + if (Idx->isNullValue() && In.getValueSizeInBits() <= 32) + return In.getOperand(0); + } + } + if (In.getOpcode() == ISD::TRUNCATE) { SDValue Src = In.getOperand(0); if (Src.getValueType().getSizeInBits() == 32) @@ -391,6 +406,68 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { return SelectionDAGISel::runOnMachineFunction(MF); } +bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const { + // XXX - only need to list legal operations. + switch (Opc) { + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FDIV: + case ISD::FREM: + case ISD::FCANONICALIZE: + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::FABS: + // Fabs is lowered to a bit operation, but it's an and which will clear the + // high bits anyway. + case ISD::FSQRT: + case ISD::FSIN: + case ISD::FCOS: + case ISD::FPOWI: + case ISD::FPOW: + case ISD::FLOG: + case ISD::FLOG2: + case ISD::FLOG10: + case ISD::FEXP: + case ISD::FEXP2: + case ISD::FCEIL: + case ISD::FTRUNC: + case ISD::FRINT: + case ISD::FNEARBYINT: + case ISD::FROUND: + case ISD::FFLOOR: + case ISD::FMINNUM: + case ISD::FMAXNUM: + case AMDGPUISD::FRACT: + case AMDGPUISD::CLAMP: + case AMDGPUISD::COS_HW: + case AMDGPUISD::SIN_HW: + case AMDGPUISD::FMIN3: + case AMDGPUISD::FMAX3: + case AMDGPUISD::FMED3: + case AMDGPUISD::FMAD_FTZ: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RCP_IFLAG: + case AMDGPUISD::LDEXP: + // On gfx10, all 16-bit instructions preserve the high bits. + return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9; + case ISD::FP_ROUND: + // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the + // high bits on gfx9. + // TODO: If we had the source node we could see if the source was fma/mad + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case ISD::FMA: + case ISD::FMAD: + case AMDGPUISD::DIV_FIXUP: + return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + default: + // fcopysign, select and others may be lowered to 32-bit bit operations + // which don't zero the high bits. + return false; + } +} + bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const { assert(Subtarget->d16PreservesUnusedBits()); MVT VT = N->getValueType(0).getSimpleVT(); @@ -1374,13 +1451,10 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, return true; } -bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, - SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, + SDValue &SOffset, SDValue &Offset, + SDValue &Offen, SDValue &Idxen, + SDValue &Addr64) const { // Subtarget prefers to use flat instruction // FIXME: This should be a pattern predicate and not reach here if (Subtarget->useFlatForGlobal()) @@ -1388,14 +1462,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDLoc DL(Addr); - if (!GLC.getNode()) - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - if (!SLC.getNode()) - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1472,9 +1538,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, - SDValue &DLC, SDValue &SWZ) const { + SDValue &Offset) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. @@ -1482,8 +1546,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (!Subtarget->hasAddr64()) return false; - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); @@ -1500,21 +1563,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { - SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC, SWZ; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} - -static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { - auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>(); - return PSV && PSV->isStack(); -} - std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { SDLoc DL(N); @@ -1551,13 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits); VAddr = SDValue(MovHighBits, 0); - // In a call sequence, stores to the argument stack area are relative to the - // stack pointer. - const MachinePointerInfo &PtrInfo - = cast<MemSDNode>(Parent)->getPointerInfo(); - SOffset = isStackPtrRelative(PtrInfo) - ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); return true; } @@ -1600,44 +1642,65 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, return true; } +static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) { + if (Val.getOpcode() != ISD::CopyFromReg) + return false; + auto RC = + TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg()); + return RC && TRI.isSGPRClass(RC); +} + bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { - ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr); - if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) - return false; - - SDLoc DL(Addr); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + SDLoc DL(Addr); - SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + // CopyFromReg <sgpr> + if (IsCopyFromSGPR(*TRI, Addr)) { + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = Addr; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; + } - const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo(); + ConstantSDNode *CAddr; + if (Addr.getOpcode() == ISD::ADD) { + // Add (CopyFromReg <sgpr>) <constant> + CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1)); + if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) + return false; + if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0))) + return false; - // FIXME: Get from MachinePointerInfo? We should only be using the frame - // offset if we know this is in a call sequence. - SOffset = isStackPtrRelative(PtrInfo) - ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) - : CurDAG->getTargetConstant(0, DL, MVT::i32); + SOffset = Addr.getOperand(0); + } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) && + SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) { + // <constant> + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + } else { + return false; + } + + SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC, - SDValue &SWZ) const { + SDValue &SOffset, SDValue &Offset + ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && @@ -1656,21 +1719,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset - ) const { - SDValue GLC, SLC, TFE, DLC, SWZ; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset, - SDValue &SLC) const { - SDValue GLC, TFE, DLC, SWZ; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); -} - // Find a load or store from corresponding pattern root. // Roots may be build_vector, bitconvert or their combinations. static MemSDNode* findMemSDNode(SDNode *N) { @@ -1685,24 +1733,25 @@ static MemSDNode* findMemSDNode(SDNode *N) { llvm_unreachable("cannot find MemSDNode in the pattern!"); } -template <bool IsSigned> -bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, - SDValue Addr, - SDValue &VAddr, - SDValue &Offset) const { +bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, + SDValue &VAddr, SDValue &Offset, + uint64_t FlatVariant) const { int64_t OffsetVal = 0; unsigned AS = findMemSDNode(N)->getAddressSpace(); - if (Subtarget->hasFlatInstOffsets() && - (!Subtarget->hasFlatSegmentOffsetBug() || - AS != AMDGPUAS::FLAT_ADDRESS)) { + bool CanHaveFlatSegmentOffsetBug = + Subtarget->hasFlatSegmentOffsetBug() && + FlatVariant == SIInstrFlags::FLAT && + (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS); + + if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) { SDValue N0, N1; if (isBaseWithConstantOffset64(Addr, N0, N1)) { - uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); + int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) { + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { Addr = N0; OffsetVal = COffsetVal; } else { @@ -1719,8 +1768,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDLoc DL(N); uint64_t RemainderOffset; - std::tie(OffsetVal, RemainderOffset) - = TII->splitFlatOffset(COffsetVal, AS, IsSigned); + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); SDValue AddOffsetLo = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); @@ -1777,6 +1826,25 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, return true; } +bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT); +} + +bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal); +} + +bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr, + SDValue &VAddr, + SDValue &Offset) const { + return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, + SIInstrFlags::FlatScratch); +} + // If this matches zero_extend i32:x, return x static SDValue matchZExtFromI32(SDValue Op) { if (Op.getOpcode() != ISD::ZERO_EXTEND) @@ -1802,126 +1870,144 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) { + if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal)) { Addr = LHS; ImmOffset = COffsetVal; - } else if (!LHS->isDivergent() && COffsetVal > 0) { - SDLoc SL(N); - // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) + - // (large_offset & MaxOffset); - int64_t SplitImmOffset, RemainderOffset; - std::tie(SplitImmOffset, RemainderOffset) - = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true); - - if (isUInt<32>(RemainderOffset)) { - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SL, MVT::i32, - CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - SAddr = LHS; - Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); - return true; + } else if (!LHS->isDivergent()) { + if (COffsetVal > 0) { + SDLoc SL(N); + // saddr + large_offset -> saddr + + // (voffset = large_offset & ~MaxOffset) + + // (large_offset & MaxOffset); + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal); + + if (isUInt<32>(RemainderOffset)) { + SDNode *VMov = CurDAG->getMachineNode( + AMDGPU::V_MOV_B32_e32, SL, MVT::i32, + CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); + SAddr = LHS; + Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16); + return true; + } } + + // We are adding a 64 bit SGPR and a constant. If constant bus limit + // is 1 we would need to perform 1 or 2 extra moves for each half of + // the constant and it is better to do a scalar add and then issue a + // single VALU instruction to materialize zero. Otherwise it is less + // instructions to perform VALU adds with immediates or inline literals. + unsigned NumLiterals = + !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) + + !TII->isInlineConstant(APInt(32, COffsetVal >> 32)); + if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals) + return false; } } // Match the variable offset. - if (Addr.getOpcode() != ISD::ADD) { - if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || - isa<ConstantSDNode>(Addr)) - return false; - - // It's cheaper to materialize a single 32-bit zero for vaddr than the two - // moves required to copy a 64-bit SGPR to VGPR. - SAddr = Addr; - SDNode *VMov = CurDAG->getMachineNode( - AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, - CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); - VOffset = SDValue(VMov, 0); - Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); - return true; - } + if (Addr.getOpcode() == ISD::ADD) { + LHS = Addr.getOperand(0); + RHS = Addr.getOperand(1); - LHS = Addr.getOperand(0); - RHS = Addr.getOperand(1); + if (!LHS->isDivergent()) { + // add (i64 sgpr), (zero_extend (i32 vgpr)) + if (SDValue ZextRHS = matchZExtFromI32(RHS)) { + SAddr = LHS; + VOffset = ZextRHS; + } + } - if (!LHS->isDivergent()) { - // add (i64 sgpr), (zero_extend (i32 vgpr)) - if (SDValue ZextRHS = matchZExtFromI32(RHS)) { - SAddr = LHS; - VOffset = ZextRHS; + if (!SAddr && !RHS->isDivergent()) { + // add (zero_extend (i32 vgpr)), (i64 sgpr) + if (SDValue ZextLHS = matchZExtFromI32(LHS)) { + SAddr = RHS; + VOffset = ZextLHS; + } } - } - if (!SAddr && !RHS->isDivergent()) { - // add (zero_extend (i32 vgpr)), (i64 sgpr) - if (SDValue ZextLHS = matchZExtFromI32(LHS)) { - SAddr = RHS; - VOffset = ZextLHS; + if (SAddr) { + Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); + return true; } } - if (!SAddr) + if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF || + isa<ConstantSDNode>(Addr)) return false; + // It's cheaper to materialize a single 32-bit zero for vaddr than the two + // moves required to copy a 64-bit SGPR to VGPR. + SAddr = Addr; + SDNode *VMov = + CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32, + CurDAG->getTargetConstant(0, SDLoc(), MVT::i32)); + VOffset = SDValue(VMov, 0); Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16); return true; } +static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { + if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) { + SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); + } else if (SAddr.getOpcode() == ISD::ADD && + isa<FrameIndexSDNode>(SAddr.getOperand(0))) { + // Materialize this into a scalar move for scalar address to avoid + // readfirstlane. + auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); + SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), + FI->getValueType(0)); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr), + MVT::i32, TFI, SAddr.getOperand(1)), + 0); + } + + return SAddr; +} + // Match (32-bit SGPR base) + sext(imm offset) -bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N, - SDValue Addr, +bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr, SDValue &SAddr, SDValue &Offset) const { if (Addr->isDivergent()) return false; - SAddr = Addr; + SDLoc DL(Addr); + int64_t COffsetVal = 0; if (CurDAG->isBaseWithConstantOffset(Addr)) { COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); SAddr = Addr.getOperand(0); + } else { + SAddr = Addr; } - if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) { - SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); - } else if (SAddr.getOpcode() == ISD::ADD && - isa<FrameIndexSDNode>(SAddr.getOperand(0))) { - // Materialize this into a scalar move for scalar address to avoid - // readfirstlane. - auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0)); - SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(), - FI->getValueType(0)); - SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr), - MVT::i32, TFI, SAddr.getOperand(1)), - 0); - } + SAddr = SelectSAddrFI(CurDAG, SAddr); const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) { - int64_t RemainderOffset = COffsetVal; - int64_t ImmField = 0; - const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true); - // Use signed division by a power of two to truncate towards 0. - int64_t D = 1LL << (NumBits - 1); - RemainderOffset = (COffsetVal / D) * D; - ImmField = COffsetVal - RemainderOffset; - - assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true)); - assert(RemainderOffset + ImmField == COffsetVal); + if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch)) { + int64_t SplitImmOffset, RemainderOffset; + std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset( + COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch); - COffsetVal = ImmField; + COffsetVal = SplitImmOffset; - SDLoc DL(N); SDValue AddOffset = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32, - SAddr, AddOffset), 0); + SAddr.getOpcode() == ISD::TargetFrameIndex + ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL) + : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32); + SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32, + SAddr, AddOffset), + 0); } - Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16); + Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16); return true; } @@ -2364,35 +2450,32 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { MachineSDNode *CmpSwap = nullptr; if (Subtarget->hasAddr64()) { - SDValue SRsrc, VAddr, SOffset, Offset, SLC; + SDValue SRsrc, VAddr, SOffset, Offset; - if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { + if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) { unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN : AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN; SDValue CmpVal = Mem->getOperand(2); - SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); + SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); // XXX - Do we care about glue operands? - SDValue Ops[] = { - CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() - }; + SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol, + Mem->getChain()}; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); } } if (!CmpSwap) { - SDValue SRsrc, SOffset, Offset, SLC; - if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { + SDValue SRsrc, SOffset, Offset; + if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) { unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN : AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN; SDValue CmpVal = Mem->getOperand(2); - SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1); - SDValue Ops[] = { - CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain() - }; + SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32); + SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()}; CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); } @@ -2623,7 +2706,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) { Opcode = AMDGPU::SOFT_WQM; break; case Intrinsic::amdgcn_wwm: - Opcode = AMDGPU::WWM; + case Intrinsic::amdgcn_strict_wwm: + Opcode = AMDGPU::STRICT_WWM; + break; + case Intrinsic::amdgcn_strict_wqm: + Opcode = AMDGPU::STRICT_WQM; break; case Intrinsic::amdgcn_interp_p1_f16: SelectInterpP1F16(N); @@ -2773,18 +2860,62 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, if (isExtractHiElt(Hi, Hi)) Mods |= SISrcMods::OP_SEL_1; + unsigned VecSize = Src.getValueSizeInBits(); Lo = stripExtractLoElt(Lo); Hi = stripExtractLoElt(Hi); + if (Lo.getValueSizeInBits() > VecSize) { + Lo = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Lo); + } + + if (Hi.getValueSizeInBits() > VecSize) { + Hi = CurDAG->getTargetExtractSubreg( + (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In), + MVT::getIntegerVT(VecSize), Hi); + } + + assert(Lo.getValueSizeInBits() <= VecSize && + Hi.getValueSizeInBits() <= VecSize); + if (Lo == Hi && !isInlineImmediate(Lo.getNode())) { // Really a scalar input. Just select from the low half of the register to // avoid packing. - Src = Lo; + if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) { + Src = Lo; + } else { + assert(Lo.getValueSizeInBits() == 32 && VecSize == 64); + + SDLoc SL(In); + SDValue Undef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL, + Lo.getValueType()), 0); + auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID + : AMDGPU::SReg_64RegClassID; + const SDValue Ops[] = { + CurDAG->getTargetConstant(RC, SL, MVT::i32), + Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) }; + + Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL, + Src.getValueType(), Ops), 0); + } SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); return true; } + if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) { + uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF() + .bitcastToAPInt().getZExtValue(); + if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) { + Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } + } + Mods = VecMods; } |