Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp'):
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 257
 1 file changed, 190 insertions(+), 67 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b4b4776ad39..d68488ccb342 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -78,6 +78,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
@@ -99,9 +105,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
@@ -173,12 +185,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
@@ -198,6 +212,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v5f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::STORE, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
+
setOperationAction(ISD::STORE, MVT::v8f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
@@ -219,6 +239,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v3i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
+
+ setOperationAction(ISD::STORE, MVT::v3f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
+
setOperationAction(ISD::STORE, MVT::v4i64, Promote);
AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
@@ -261,6 +287,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
+ setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
+ setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
+
setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
@@ -325,8 +356,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -335,6 +372,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
@@ -343,6 +384,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
@@ -412,8 +455,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
static const MVT::SimpleValueType VectorIntTypes[] = {
- MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
- };
+ MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
for (MVT VT : VectorIntTypes) {
// Expand the following operations for the current type by default.
@@ -454,8 +496,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
}
static const MVT::SimpleValueType FloatVectorTypes[] = {
- MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
- };
+ MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
for (MVT VT : FloatVectorTypes) {
setOperationAction(ISD::FABS, VT, Expand);
@@ -505,6 +546,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
+ setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
+
+ setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
+
// There are no libcalls of any kind.
for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
@@ -846,9 +893,9 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
- return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && VT == MVT::f16) ||
- (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
+ // Report this based on the final legalized type.
+ VT = VT.getScalarType();
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -1257,8 +1304,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
- case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
- case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return LowerFP_TO_INT(Op, DAG);
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTLZ:
@@ -1304,7 +1352,8 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isModuleEntryFunction()) {
+ if (!MFI->isModuleEntryFunction() &&
+ !GV->getName().equals("llvm.amdgcn.module.lds")) {
SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
@@ -1368,6 +1417,14 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
+ EVT SrcVT = Op.getOperand(0).getValueType();
+
+ // For these types, we have some TableGen patterns, except when the index is 1.
+ if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
+ (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
+ Start != 1)
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
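
Why index 1 is the odd one out: a v4f16/v4i16 value occupies two 32-bit
registers, so a v2f16/v2i16 extract that starts on a 32-bit boundary is a
plain subregister copy the TableGen patterns can match, while Start == 1
straddles both registers. A minimal sketch of that alignment check
(extractIsSubregAligned is a hypothetical helper, not part of this patch):

    // Start = 0 or 2: the two-element slice lives in one 32-bit register.
    // Start = 1: elements 1..2 straddle the register boundary.
    static bool extractIsSubregAligned(unsigned Start, unsigned EltBits = 16,
                                       unsigned RegBits = 32) {
      return (Start * EltBits) % RegBits == 0;
    }
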
@@ -2579,33 +2636,77 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
return LowerINT_TO_FP64(Op, DAG, true);
}
-SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
bool Signed) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
- SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
+ assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
- SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
- MVT::f64);
- SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
- MVT::f64);
- // TODO: Should this propagate fast-math-flags?
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
+ // The basic idea of converting a floating point number into a pair of 32-bit
+ // integers is illustrated as follows:
+ //
+ // tf := trunc(val);
+ // hif := floor(tf * 2^-32);
+ // lof := tf - hif * 2^32; // lof is always non-negative due to floor.
+ // hi := fptoi(hif);
+ // lo := fptoi(lof);
+ //
+ SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
+ SDValue Sign;
+ if (Signed && SrcVT == MVT::f32) {
+ // However, a 32-bit floating point number has only a 23-bit mantissa, which
+ // is not enough to hold all the significant bits of `lof` if val is
+ // negative. To avoid the loss of precision, we need to take the absolute
+ // value after truncating and flip the result back based on the original
+ // signedness.
+ Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
+ DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
+ DAG.getConstant(31, SL, MVT::i32));
+ Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
+ }
- SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
+ SDValue K0, K1;
+ if (SrcVT == MVT::f64) {
+ K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
+ SL, SrcVT);
+ K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
+ SL, SrcVT);
+ } else {
+ K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
+ SrcVT);
+ K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
+ SrcVT);
+ }
+ // TODO: Should this propagate fast-math-flags?
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
+ SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
- SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
+ SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
- SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
- MVT::i32, FloorMul);
+ SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT,
+ SL, MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
+
+ if (Signed && SrcVT == MVT::f32) {
+ assert(Sign);
+ // Flip the result based on the signedness, which is either all 0s or 1s.
+ Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
+ DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
+ // r := xor(r, sign) - sign;
+ Result =
+ DAG.getNode(ISD::SUB, SL, MVT::i64,
+ DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
+ }
- return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
+ return Result;
}
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
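
The algorithm above can be checked against a host-side reference. A minimal
sketch under the assumption of in-range inputs (helper names are hypothetical;
C++17 for the hex float literals):

    #include <cmath>
    #include <cstdint>

    // Unsigned f64 -> u64: split the truncated value into two u32 halves.
    static uint64_t fp64ToUInt64Ref(double Val) {
      double Tf  = std::trunc(Val);            // tf  := trunc(val)
      double Hif = std::floor(Tf * 0x1p-32);   // hif := floor(tf * 2^-32)
      double Lof = std::fma(Hif, -0x1p32, Tf); // lof := tf - hif * 2^32
      uint64_t Hi = (uint32_t)Hif;             // hi := fptoui(hif)
      uint64_t Lo = (uint32_t)Lof;             // lo := fptoui(lof)
      return (Hi << 32) | Lo;
    }

    // Signed f32 -> i64: take |trunc| first, then restore the sign with the
    // same xor/sub trick the DAG code emits.
    static int64_t fp32ToSInt64Ref(float Val) {
      float Trunc   = std::trunc(Val);
      uint64_t Sign = std::signbit(Trunc) ? ~UINT64_C(0) : 0; // all 0s or 1s
      float Abs     = std::fabs(Trunc);
      float Hif     = std::floor(Abs * 0x1p-32f);
      float Lof     = std::fma(Hif, -0x1p32f, Abs);
      uint64_t R    = ((uint64_t)(uint32_t)Hif << 32) | (uint32_t)Lof;
      return (int64_t)((R ^ Sign) - Sign);     // r := xor(r, sign) - sign
    }
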
@@ -2707,44 +2808,37 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con
return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
}
-SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Src = Op.getOperand(0);
+ unsigned OpOpcode = Op.getOpcode();
+ EVT SrcVT = Src.getValueType();
+ EVT DestVT = Op.getValueType();
- // TODO: Factor out code common with LowerFP_TO_UINT.
+ // Will be selected natively.
+ if (SrcVT == MVT::f16 && DestVT == MVT::i16)
+ return Op;
- EVT SrcVT = Src.getValueType();
- if (SrcVT == MVT::f16 ||
- (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
+ // Promote i16 to i32
+ if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
SDLoc DL(Op);
- SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
}
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, true);
-
- return SDValue();
-}
-
-SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
- SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
-
- // TODO: Factor out code common with LowerFP_TO_SINT.
-
- EVT SrcVT = Src.getValueType();
if (SrcVT == MVT::f16 ||
(SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
SDLoc DL(Op);
- SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
+ SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
+ unsigned Ext =
+ OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
}
- if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
- return LowerFP64_TO_INT(Op, DAG, false);
+ if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
+ return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
return SDValue();
}
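
The shared f16 path relies on the fact that every finite f16 value has
magnitude at most 65504, so a conversion through i32 can never saturate and
the final i64 is just the sign- or zero-extension of the i32 result. A tiny
sketch of that equivalence (hypothetical name, signed case):

    #include <cstdint>

    // fp_to_sint f16 -> i64, lowered as fptosi-to-i32 plus sign_extend.
    static int64_t f16ToI64ViaI32(float HalfVal /* already f16-rounded */) {
      int32_t V32 = (int32_t)HalfVal; // fp_to_sint ... i32, never saturates
      return (int64_t)V32;            // sign_extend (zero_extend if unsigned)
    }
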
@@ -2787,8 +2881,8 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
}
-static SDValue simplifyI24(SDNode *Node24,
- TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue simplifyMul24(SDNode *Node24,
+ TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
@@ -2890,9 +2984,8 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- LN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2946,9 +3039,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
- SN->getMemOperand()->getFlags(),
- &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(
+ VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3010,7 +3102,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
switch (IID) {
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
- return simplifyI24(N, DCI);
+ return simplifyMul24(N, DCI);
case Intrinsic::amdgcn_fract:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
@@ -3312,6 +3404,13 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ if (!N->isDivergent())
+ return SDValue();
+
unsigned Size = VT.getSizeInBits();
if (VT.isVector() || Size > 64)
return SDValue();
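
The same divergence guard recurs in the two MULH combines below. A sketch of
the combined heuristic, with shouldUseMul24 as a hypothetical helper:

    #include "llvm/CodeGen/SelectionDAGNodes.h"

    // Uniform (SGPR) values keep the plain 32-bit multiply: the scalar unit
    // has s_mul_i32 but no 24-bit multiply, so forming mul24 would force the
    // value into VGPRs. For the high-half multiplies the guard only matters
    // when s_mul_hi_[iu]32 exists; without it the op lands on the VALU anyway.
    static bool shouldUseMul24(const llvm::SDNode *N, bool IsMulHi,
                               bool HasSMulHi) {
      if (!IsMulHi)
        return N->isDivergent();
      return N->isDivergent() || !HasSMulHi;
    }
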
@@ -3362,6 +3461,15 @@ SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
if (!Subtarget->hasMulI24() || VT.isVector())
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // VALU op anyway).
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3386,6 +3494,15 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
return SDValue();
+ // Don't generate 24-bit multiplies on values that are in SGPRs, since
+ // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
+ // unnecessarily). isDivergent() is used as an approximation of whether the
+ // value is in an SGPR.
+ // This doesn't apply if no s_mul_hi is available (since we'll end up with a
+ // VALU op anyway).
+ if (Subtarget->hasSMulHi() && !N->isDivergent())
+ return SDValue();
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
@@ -3985,11 +4102,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24:
case AMDGPUISD::MULHI_I24:
- case AMDGPUISD::MULHI_U24: {
- if (SDValue V = simplifyI24(N, DCI))
- return V;
- return SDValue();
- }
+ case AMDGPUISD::MULHI_U24:
+ return simplifyMul24(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::FNEG:
@@ -4159,8 +4273,13 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
+ // Stores to the argument stack area are relative to the stack pointer.
+ SDValue SP =
+ DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
+ Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
MachineMemOperand::MODereferenceable);
return Store;
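
In effect the slot address changes from a bare constant to stack-pointer
relative arithmetic; as plain scalar math (a hypothetical illustration, not
this function's code):

    #include <cstdint>

    // Outgoing-argument slot address: previously just Offset, now SP-relative.
    static uint32_t outgoingArgAddr(uint32_t StackPtr, uint32_t Offset) {
      return StackPtr + Offset; // was: return Offset;
    }
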
@@ -4297,7 +4416,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
- NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -4350,6 +4468,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
@@ -4425,8 +4545,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
- case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT: {
+ case AMDGPUISD::FP_TO_FP16: {
unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
@@ -4573,7 +4692,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16;
case AMDGPUISD::FP_TO_FP16:
- case AMDGPUISD::FP16_ZEXT:
return 16;
default:
return 1;
@@ -4727,3 +4845,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::None;
}
}
+
+bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
+ unsigned Opc, LLT Ty1, LLT Ty2) const {
+ return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+}
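
For reference, the operation this hook declares legal computes a constant
unsigned bitfield extract; a minimal sketch of its semantics (ubfxRef is a
hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Extract Width bits starting at bit Offset, zero-extended. The hook
    // reports this legal when source and result are matching 32- or 64-bit
    // scalar types.
    static uint64_t ubfxRef(uint64_t Src, unsigned Offset, unsigned Width) {
      assert(Width > 0 && Width < 64 && "sketch skips full-width extracts");
      return (Src >> Offset) & ((UINT64_C(1) << Width) - 1);
    }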