Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1484 ++++++++++---------
 1 file changed, 672 insertions(+), 812 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1a59a460ee7d..352423ed3ad6 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
-#include "AMDGPUDiagnosticInfoUnsupported.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
@@ -28,16 +27,19 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
-
+#include "llvm/IR/DiagnosticInfo.h"
+#include "SIInstrInfo.h"
using namespace llvm;
-static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- unsigned Offset = State.AllocateStack(ValVT.getStoreSize(),
- ArgFlags.getOrigAlign());
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ MachineFunction &MF = State.getMachineFunction();
+ AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+ ArgFlags.getOrigAlign());
+ State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return true;
}
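For reference, a CCCustom hook with this signature is normally referenced by name from the calling-convention tablegen, and the generated CC_* routine invokes it roughly as below; this is a hedged sketch of the generated call site, not code from this patch:

    // Generated CC code calls the hook; returning true means "assigned".
    if (allocateKernArg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State))
      return false; // argument handled, no CC failure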
@@ -53,60 +55,104 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
-// Type for a vector that will be loaded to.
-EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) {
+EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
if (StoreSize <= 32)
- return EVT::getIntegerVT(Ctx, 32);
+ return EVT::getIntegerVT(Ctx, StoreSize);
return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
}
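A couple of illustrative results under the renamed function's new behavior:

    // getEquivalentBitType(f16)   -> i16   (exact width; previously i32)
    // getEquivalentBitType(v2f32) -> v2i32 (unchanged: 64 bits / 32)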
-AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
+AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
const AMDGPUSubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
- setOperationAction(ISD::Constant, MVT::i32, Legal);
- setOperationAction(ISD::Constant, MVT::i64, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
- setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ // Lower floating point store/load to integer store/load to reduce the number
+ // of patterns in tablegen.
+ setOperationAction(ISD::LOAD, MVT::f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
- setOperationAction(ISD::BR_JT, MVT::Other, Expand);
- setOperationAction(ISD::BRIND, MVT::Other, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
- // This is totally unsupported, just custom lower to produce an error.
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
- // We need to custom lower some of the intrinsics
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
- // Library functions. These default to Expand, but we have instructions
- // for them.
- setOperationAction(ISD::FCEIL, MVT::f32, Legal);
- setOperationAction(ISD::FEXP2, MVT::f32, Legal);
- setOperationAction(ISD::FPOW, MVT::f32, Legal);
- setOperationAction(ISD::FLOG2, MVT::f32, Legal);
- setOperationAction(ISD::FABS, MVT::f32, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
- setOperationAction(ISD::FRINT, MVT::f32, Legal);
- setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::FROUND, MVT::f32, Custom);
- setOperationAction(ISD::FROUND, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
- setOperationAction(ISD::FREM, MVT::f32, Custom);
- setOperationAction(ISD::FREM, MVT::f64, Custom);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ setOperationAction(ISD::LOAD, MVT::f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
- // Expand to fneg + fadd.
- setOperationAction(ISD::FSUB, MVT::f64, Expand);
+ setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+
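For loads, the Promote action plus AddPromotedToType means the legalizer re-issues the load with the integer type and bitcasts the result back. A rough DAG-level sketch, assuming a hypothetical LoadSDNode *LD of type f32:

    // Conceptual result of promoting an f32 load to i32 (sketch only).
    SDValue IntLoad = DAG.getLoad(MVT::i32, DL, LD->getChain(),
                                  LD->getBasePtr(), LD->getPointerInfo());
    SDValue AsFP = DAG.getNode(ISD::BITCAST, DL, MVT::f32, IntLoad);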
+ // There are no 64-bit extloads. These should be done as a 32-bit extload and
+ // an extension to 64-bit.
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
+ }
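The expansion this requests, sketched for a zero-extending i64 load from i32 (Chain/Ptr/PtrInfo are hypothetical values):

    // i64 zextload from i32 == 32-bit load + separate ZERO_EXTEND.
    SDValue Narrow = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);
    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Narrow);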
+
+ for (MVT VT : MVT::integer_valuetypes()) {
+ if (VT == MVT::i64)
+ continue;
+
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
+ }
+
+ for (MVT VT : MVT::integer_vector_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
+ }
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
- // Lower floating point store/load to integer store/load to reduce the number
- // of patterns in tablegen.
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -122,51 +168,99 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v2i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
+
setOperationAction(ISD::STORE, MVT::f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+ AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
- AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+ AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- // Custom lowering of vector stores is required for local address space
- // stores.
- setOperationAction(ISD::STORE, MVT::v4i32, Custom);
-
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
- // XXX: This can be change to Custom, once ExpandVectorStores can
- // handle 64-bit stores.
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i16, Expand);
- setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i8, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+ setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+
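An Expand'ed truncating store becomes an explicit truncate feeding a plain store; e.g. for the i64-to-i32 case just marked Expand (sketch; Val64/Ptr/PtrInfo are hypothetical):

    // truncstore i64 -> i32 lowered as trunc + ordinary store.
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Val64);
    SDValue St = DAG.getStore(Chain, DL, Trunc, Ptr, PtrInfo);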
setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
- setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
+ setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
+ setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
+ setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
+ setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
- setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
- setOperationAction(ISD::LOAD, MVT::f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+ setOperationAction(ISD::Constant, MVT::i32, Legal);
+ setOperationAction(ISD::Constant, MVT::i64, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
- setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
- AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+ setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+ // This is totally unsupported, just custom lower to produce an error.
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+
+ // We need to custom lower some of the intrinsics
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+ // Library functions. These default to Expand, but we have instructions
+ // for them.
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::FEXP2, MVT::f32, Legal);
+ setOperationAction(ISD::FPOW, MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Legal);
+ setOperationAction(ISD::FABS, MVT::f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
+ setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
+
+ setOperationAction(ISD::FROUND, MVT::f32, Custom);
+ setOperationAction(ISD::FROUND, MVT::f64, Custom);
+
+ setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
+ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
+
+ setOperationAction(ISD::FREM, MVT::f32, Custom);
+ setOperationAction(ISD::FREM, MVT::f64, Custom);
+
+ // v_mad_f32 does not support denormals according to some sources.
+ if (!Subtarget->hasFP32Denormals())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
+
+ // Expand to fneg + fadd.
+ setOperationAction(ISD::FSUB, MVT::f64, Expand);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
@@ -179,31 +273,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
- // There are no 64-bit extloads. These should be done as a 32-bit extload and
- // an extension to 64-bit.
- for (MVT VT : MVT::integer_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
- }
-
- for (MVT VT : MVT::integer_vector_valuetypes()) {
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
- setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
- }
-
- setOperationAction(ISD::BR_CC, MVT::i1, Expand);
-
if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
setOperationAction(ISD::FCEIL, MVT::f64, Custom);
setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
@@ -219,28 +288,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
-
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f32, MVT::f16, Expand);
- setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
- setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
- setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
-
- setTruncStoreAction(MVT::f64, MVT::f16, Expand);
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
for (MVT VT : ScalarIntVTs) {
- setOperationAction(ISD::SREM, VT, Expand);
+ // These should use [SU]DIVREM, so set them to expand
setOperationAction(ISD::SDIV, VT, Expand);
+ setOperationAction(ISD::UDIV, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UREM, VT, Expand);
// GPU does not have divrem function for signed or unsigned.
setOperationAction(ISD::SDIVREM, VT, Custom);
@@ -284,17 +338,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
if (Subtarget->hasFFBH())
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
- else
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-
- if (!Subtarget->hasFFBL())
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+ if (Subtarget->hasFFBL())
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ // We only really have 32-bit BFE instructions (and 16-bit on VI).
+ //
+ // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
+ // effort to match them now. We want this to be false for i64 cases when the
+ // extraction isn't restricted to the upper or lower half. Ideally we would
+ // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
+ // span the midpoint are probably relatively rare, so don't worry about them
+ // for now.
+ if (Subtarget->hasBFE())
+ setHasExtractBitsInsn(true);
+
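The scalar semantics of the 32-bit BFE being advertised here, written in the same style as the constantFoldBFE helper later in this file (illustrative only; Offset + Width assumed <= 32):

    // v_bfe_i32-style signed bitfield extract.
    static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
      if (Width == 0)
        return 0;
      uint32_t Shl = static_cast<uint32_t>(Src) << (32 - Offset - Width);
      return static_cast<int32_t>(Shl) >> (32 - Width); // arithmetic shift
    }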
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v4i32
};
@@ -334,9 +395,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::BSWAP, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
- setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
@@ -366,24 +425,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FSUB, VT, Expand);
setOperationAction(ISD::FNEG, VT, Expand);
- setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
}
- setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
- setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
-
- setTargetDAGCombine(ISD::SHL);
- setTargetDAGCombine(ISD::MUL);
- setTargetDAGCombine(ISD::SELECT);
- setTargetDAGCombine(ISD::SELECT_CC);
- setTargetDAGCombine(ISD::STORE);
+ // This causes using an unrolled select operation rather than expansion with
+ // bit operations. This is in general better, but the alternative using BFI
+ // instructions may be better if the select sources are SGPRs.
+ setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
- setTargetDAGCombine(ISD::FADD);
- setTargetDAGCombine(ISD::FSUB);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
+ AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
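The BFI alternative mentioned in the comment does the select with three bit operations once the condition is broadcast to a full mask; a scalar sketch of v_bfi_b32's semantics:

    // v_bfi_b32: take bits of A where Mask is set, else bits of B.
    static uint32_t bfiSelect(uint32_t Mask, uint32_t A, uint32_t B) {
      return (A & Mask) | (B & ~Mask);
    }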
setBooleanContents(ZeroOrNegativeOneBooleanContent);
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -394,7 +449,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
// SI at least has hardware support for floating point exceptions, but no way
// of using or handling them is implemented. They are also optional in OpenCL
// (Section 7.3)
- setHasFloatingPointExceptions(false);
+ setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
setSelectIsExpensive(false);
PredictableSelectIsExpensive = false;
@@ -415,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
MaxStoresPerMemcpy = 4096;
MaxStoresPerMemmove = 4096;
MaxStoresPerMemset = 4096;
+
+ setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::SRA);
+ setTargetDAGCombine(ISD::SRL);
+ setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT);
+ setTargetDAGCombine(ISD::SELECT_CC);
+ setTargetDAGCombine(ISD::STORE);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
}
//===----------------------------------------------------------------------===//
@@ -467,15 +534,17 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
EVT CastTy) const {
- if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
- return true;
- unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
- unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+ assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
- return ((LScalarSize <= CastScalarSize) ||
- (CastScalarSize >= 32) ||
- (LScalarSize < 32));
+ if (LoadTy.getScalarType() == MVT::i32)
+ return false;
+
+ unsigned LScalarSize = LoadTy.getScalarSizeInBits();
+ unsigned CastScalarSize = CastTy.getScalarSizeInBits();
+
+ return (LScalarSize < CastScalarSize) ||
+ (CastScalarSize >= 32);
}
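Some illustrative results of the new predicate, all with equal total widths as the assert requires:

    // isLoadBitCastBeneficial(v4i32, v2i64) -> false (i32 loads canonical)
    // isLoadBitCastBeneficial(v8i16, v4i32) -> true  (cast scalar >= 32)
    // isLoadBitCastBeneficial(i64,   v2i32) -> true  (cast scalar >= 32)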
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -578,14 +647,13 @@ void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
State.AnalyzeReturn(Outs, RetCC_SI);
}
-SDValue AMDGPUTargetLowering::LowerReturn(
- SDValue Chain,
- CallingConv::ID CallConv,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- SDLoc DL, SelectionDAG &DAG) const {
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+SDValue
+AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SDLoc &DL, SelectionDAG &DAG) const {
+ return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
@@ -606,32 +674,38 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
FuncName = G->getGlobal()->getName();
- DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+ DiagnosticInfoUnsupported NoCalls(
+ Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
DAG.getContext()->diagnose(NoCalls);
- return SDValue();
+
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+
+ return DAG.getEntryNode();
}
SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca");
+ DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
+ SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(NoDynamicAlloca);
- return SDValue();
+ auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
+ return DAG.getMergeValues(Ops, SDLoc());
}
SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
- Op.getNode()->dump();
+ Op->dump(&DAG);
llvm_unreachable("Custom lowering code for this"
"instruction is not implemented yet!");
break;
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
@@ -666,24 +740,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
- case ISD::LOAD: {
- SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
- if (!Node)
- return;
-
- Results.push_back(SDValue(Node, 0));
- Results.push_back(SDValue(Node, 1));
- // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
- // function
- DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
- return;
- }
- case ISD::STORE: {
- SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG);
- if (Lowered.getNode())
- Results.push_back(Lowered);
- return;
- }
default:
return;
}
@@ -712,16 +768,16 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
EVT VT = EVT::getEVT(CFP->getType());
PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(CFP->getType()));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(CFP->getType()));
}
if (StructType *ST = dyn_cast<StructType>(InitTy)) {
@@ -769,8 +825,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
EVT VT = EVT::getEVT(InitTy);
PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS);
return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr,
- MachinePointerInfo(UndefValue::get(PtrTy)), false,
- false, TD.getPrefTypeAlignment(InitTy));
+ MachinePointerInfo(UndefValue::get(PtrTy)),
+ TD.getPrefTypeAlignment(InitTy));
}
Init->dump();
@@ -782,10 +838,7 @@ static bool hasDefinedInitializer(const GlobalValue *GV) {
if (!GVar || !GVar->hasInitializer())
return false;
- if (isa<UndefValue>(GVar->getInitializer()))
- return false;
-
- return true;
+ return !isa<UndefValue>(GVar->getInitializer());
}
SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
@@ -797,6 +850,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
const GlobalValue *GV = G->getGlobal();
switch (G->getAddressSpace()) {
+ case AMDGPUAS::CONSTANT_ADDRESS: {
+ MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+ SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT);
+ return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA);
+ }
case AMDGPUAS::LOCAL_ADDRESS: {
// XXX: What does the value of G->getOffset() mean?
assert(G->getOffset() == 0 &&
@@ -808,11 +866,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
unsigned Offset;
if (MFI->LocalMemoryObjects.count(GV) == 0) {
- uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
- Offset = MFI->LDSSize;
+ unsigned Align = GV->getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV->getValueType());
+
+ /// TODO: We should sort these to minimize wasted space due to alignment
+ /// padding. Currently the padding is decided by the first encountered use
+ /// during lowering.
+ Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align);
MFI->LocalMemoryObjects[GV] = Offset;
- // XXX: Account for alignment?
- MFI->LDSSize += Size;
+ MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType());
} else {
Offset = MFI->LocalMemoryObjects[GV];
}
@@ -820,50 +883,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
return DAG.getConstant(Offset, SDLoc(Op),
getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS));
}
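The allocation logic above, reduced to a standalone sketch (names hypothetical): align the running LDS size up to the global's alignment, record that as the offset, then bump by the allocation size:

    // Bump allocator with alignment padding, as now used for LDS globals.
    static uint64_t allocateLDSGlobal(uint64_t &LDSSize, uint64_t Size,
                                      unsigned Align) {
      uint64_t Offset = alignTo(LDSSize, Align); // llvm::alignTo
      LDSSize = Offset + Size;
      return Offset;
    }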
- case AMDGPUAS::CONSTANT_ADDRESS: {
- MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
- Type *EltType = GV->getType()->getElementType();
- unsigned Size = DL.getTypeAllocSize(EltType);
- unsigned Alignment = DL.getPrefTypeAlignment(EltType);
-
- MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS);
- MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
-
- int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
- SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT);
-
- const GlobalVariable *Var = cast<GlobalVariable>(GV);
- if (!Var->hasInitializer()) {
- // This has no use, but bugpoint will hit it.
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
-
- const Constant *Init = Var->getInitializer();
- SmallVector<SDNode*, 8> WorkList;
-
- for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(),
- E = DAG.getEntryNode()->use_end(); I != E; ++I) {
- if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD)
- continue;
- WorkList.push_back(*I);
- }
- SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG);
- for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(),
- E = WorkList.end(); I != E; ++I) {
- SmallVector<SDValue, 8> Ops;
- Ops.push_back(Chain);
- for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
- Ops.push_back((*I)->getOperand(i));
- }
- DAG.UpdateNodeOperands(*I, Ops);
- }
- return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT);
- }
}
const Function &Fn = *DAG.getMachineFunction().getFunction();
- DiagnosticInfoUnsupported BadInit(Fn,
- "initializer for address space");
+ DiagnosticInfoUnsupported BadInit(
+ Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
DAG.getContext()->diagnose(BadInit);
return SDValue();
}
@@ -875,7 +899,7 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
@@ -887,23 +911,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
- return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
-}
-
-SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
- SelectionDAG &DAG) const {
-
- MachineFunction &MF = DAG.getMachineFunction();
- const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering();
-
- FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
-
- unsigned FrameIndex = FIN->getIndex();
- unsigned IgnoredFrameReg;
- unsigned Offset =
- TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
- return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
- Op.getValueType());
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
}
SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -914,121 +922,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
switch (IntrinsicID) {
default: return Op;
- case AMDGPUIntrinsic::AMDGPU_abs:
- case AMDGPUIntrinsic::AMDIL_abs: // Legacy name.
- return LowerIntrinsicIABS(Op, DAG);
- case AMDGPUIntrinsic::AMDGPU_lrp:
- return LowerIntrinsicLRP(Op, DAG);
-
- case AMDGPUIntrinsic::AMDGPU_clamp:
- case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name.
+ case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::AMDGPU_div_scale: {
- // 3rd parameter required to be a constant.
- const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
- if (!Param)
- return DAG.getUNDEF(VT);
-
- // Translate to the operands expected by the machine instruction. The
- // first parameter must be the same as the first instruction.
- SDValue Numerator = Op.getOperand(1);
- SDValue Denominator = Op.getOperand(2);
-
- // Note this order is opposite of the machine instruction's operations,
- // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
- // intrinsic has the numerator as the first operand to match a normal
- // division operation.
-
- SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
-
- return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
- Denominator, Numerator);
- }
-
- case Intrinsic::AMDGPU_div_fmas:
- return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
- Op.getOperand(4));
-
- case Intrinsic::AMDGPU_div_fixup:
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case Intrinsic::AMDGPU_trig_preop:
- return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case Intrinsic::AMDGPU_rcp:
- return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq:
- return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
-
- case Intrinsic::AMDGPU_rsq_clamped:
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- Type *Type = VT.getTypeForEVT(*DAG.getContext());
- APFloat Max = APFloat::getLargest(Type->getFltSemantics());
- APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
-
- SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
- SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
- DAG.getConstantFP(Max, DL, VT));
- return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
- DAG.getConstantFP(Min, DL, VT));
- } else {
- return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
- }
-
- case Intrinsic::AMDGPU_ldexp:
- return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imax:
- return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umax:
- return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_imin:
- return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
- case AMDGPUIntrinsic::AMDGPU_umin:
- return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1),
- Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umul24:
- return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_imul24:
- return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDGPU_umad24:
- return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_imad24:
- return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
-
case AMDGPUIntrinsic::AMDGPU_bfe_i32:
return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
Op.getOperand(1),
@@ -1040,69 +937,13 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfi:
- return DAG.getNode(AMDGPUISD::BFI, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2),
- Op.getOperand(3));
-
- case AMDGPUIntrinsic::AMDGPU_bfm:
- return DAG.getNode(AMDGPUISD::BFM, DL, VT,
- Op.getOperand(1),
- Op.getOperand(2));
-
- case Intrinsic::AMDGPU_class:
- return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
-
- case AMDGPUIntrinsic::AMDIL_exp: // Legacy name.
- return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1));
-
- case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name.
- return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name.
- return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1));
- case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name
- return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1));
}
}
-///IABS(a) = SMAX(sub(0, a), a)
-SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- Op.getOperand(1));
-
- return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1));
-}
-
-/// Linear Interpolation
-/// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
-SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- // TODO: Should this propagate fast-math-flags?
- SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
- DAG.getConstantFP(1.0f, DL, MVT::f32),
- Op.getOperand(1));
- SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA,
- Op.getOperand(3));
- return DAG.getNode(ISD::FADD, DL, VT,
- DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)),
- OneSubAC);
-}
-
/// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
- EVT VT,
- SDValue LHS,
- SDValue RHS,
- SDValue True,
- SDValue False,
+SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+ SDValue LHS, SDValue RHS,
+ SDValue True, SDValue False,
SDValue CC,
DAGCombinerInfo &DCI) const {
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -1176,56 +1017,48 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL,
return SDValue();
}
-SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op,
- SelectionDAG &DAG) const {
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- EVT MemVT = Load->getMemoryVT();
- EVT MemEltVT = MemVT.getVectorElementType();
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- EVT LoadVT = Op.getValueType();
- EVT EltVT = LoadVT.getVectorElementType();
- EVT PtrVT = Load->getBasePtr().getValueType();
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
- unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
- SmallVector<SDValue, 8> Loads;
- SmallVector<SDValue, 8> Chains;
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- SDLoc SL(Op);
- unsigned MemEltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Load->getMemOperand()->getValue());
+ SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
- DAG.getConstant(i * MemEltSize, SL, PtrVT));
+ return std::make_pair(Lo, Hi);
+}
- SDValue NewLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
- Load->getChain(), Ptr,
- SrcValue.getWithOffset(i * MemEltSize),
- MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), Load->getAlignment());
- Loads.push_back(NewLoad.getValue(0));
- Chains.push_back(NewLoad.getValue(1));
- }
+SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
- SDValue Ops[] = {
- DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads),
- DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains)
- };
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
+}
- return DAG.getMergeValues(Ops, SL);
+SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+
+ SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
+ const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
}
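Intended use of the new helpers at a call site, per the pattern this patch establishes (hedged sketch, not a quote from the patch):

    // Split a 64-bit value into i32 halves, then rebuild it.
    SDValue Lo, Hi;
    std::tie(Lo, Hi) = split64BitValue(Op.getOperand(0), DAG);
    SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
    SDValue Joined = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);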
SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SelectionDAG &DAG) const {
+ LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT VT = Op.getValueType();
+
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorLoad(Op, DAG);
+ return scalarizeVectorLoad(Load, DAG);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
SDValue BasePtr = Load->getBasePtr();
EVT PtrVT = BasePtr.getValueType();
EVT MemVT = Load->getMemoryVT();
@@ -1245,22 +1078,15 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
unsigned BaseAlign = Load->getAlignment();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
- Load->getChain(), BasePtr,
- SrcValue,
- LoMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), BaseAlign);
-
+ SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
+ Load->getChain(), BasePtr, SrcValue, LoMemVT,
+ BaseAlign, Load->getMemOperand()->getFlags());
SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
DAG.getConstant(Size, SL, PtrVT));
-
- SDValue HiLoad
- = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT,
- Load->getChain(), HiPtr,
- SrcValue.getWithOffset(LoMemVT.getStoreSize()),
- HiMemVT, Load->isVolatile(), Load->isNonTemporal(),
- Load->isInvariant(), HiAlign);
+ SDValue HiLoad =
+ DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
+ HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
+ HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
SDValue Ops[] = {
DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
@@ -1271,6 +1097,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
return DAG.getMergeValues(Ops, SL);
}
+// FIXME: This isn't doing anything for SI. This should be used in a target
+// combine during type legalization.
SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
SelectionDAG &DAG) const {
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -1317,48 +1145,15 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
if (PackedSize < 32) {
EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
- Store->getMemOperand()->getPointerInfo(),
- PackedVT,
- Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
+ Store->getMemOperand()->getPointerInfo(), PackedVT,
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
Store->getMemOperand()->getPointerInfo(),
- Store->isVolatile(), Store->isNonTemporal(),
- Store->getAlignment());
-}
-
-SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op,
- SelectionDAG &DAG) const {
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
- EVT EltVT = Store->getValue().getValueType().getVectorElementType();
- EVT PtrVT = Store->getBasePtr().getValueType();
- unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
- SDLoc SL(Op);
-
- SmallVector<SDValue, 8> Chains;
-
- unsigned EltSize = MemEltVT.getStoreSize();
- MachinePointerInfo SrcValue(Store->getMemOperand()->getValue());
-
- for (unsigned i = 0, e = NumElts; i != e; ++i) {
- SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Store->getValue(),
- DAG.getConstant(i, SL, MVT::i32));
-
- SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT);
- SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset);
- SDValue NewStore =
- DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
- SrcValue.getWithOffset(i * EltSize),
- MemEltVT, Store->isNonTemporal(), Store->isVolatile(),
- Store->getAlignment());
- Chains.push_back(NewStore);
- }
-
- return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
+ Store->getAlignment(),
+ Store->getMemOperand()->getFlags());
}
SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
@@ -1370,7 +1165,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
// If this is a 2 element vector, we really want to scalarize and not create
// weird 1 element vectors.
if (VT.getVectorNumElements() == 2)
- return ScalarizeVectorStore(Op, DAG);
+ return scalarizeVectorStore(Store, DAG);
EVT MemVT = Store->getMemoryVT();
SDValue Chain = Store->getChain();
@@ -1395,171 +1190,21 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
unsigned Size = LoMemVT.getStoreSize();
unsigned HiAlign = MinAlign(BaseAlign, Size);
- SDValue LoStore
- = DAG.getTruncStore(Chain, SL, Lo,
- BasePtr,
- SrcValue,
- LoMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- BaseAlign);
- SDValue HiStore
- = DAG.getTruncStore(Chain, SL, Hi,
- HiPtr,
- SrcValue.getWithOffset(Size),
- HiMemVT,
- Store->isNonTemporal(),
- Store->isVolatile(),
- HiAlign);
+ SDValue LoStore =
+ DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
+ Store->getMemOperand()->getFlags());
+ SDValue HiStore =
+ DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
+ HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
}
-
-SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- LoadSDNode *Load = cast<LoadSDNode>(Op);
- ISD::LoadExtType ExtType = Load->getExtensionType();
- EVT VT = Op.getValueType();
- EVT MemVT = Load->getMemoryVT();
-
- if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
- assert(VT == MVT::i1 && "Only i1 non-extloads expected");
- // FIXME: Copied from PPC
- // First, load into 32 bits, then truncate to 1 bit.
-
- SDValue Chain = Load->getChain();
- SDValue BasePtr = Load->getBasePtr();
- MachineMemOperand *MMO = Load->getMemOperand();
-
- SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
- BasePtr, MVT::i8, MMO);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD),
- NewLD.getValue(1)
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS ||
- Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS ||
- ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32))
- return SDValue();
-
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
-
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(), Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
-
- // Get offset within the register.
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
-
- // Bit offset of target byte (byteIdx * 8).
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- // Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
-
- // Eliminate the upper bits by setting them to ...
- EVT MemEltVT = MemVT.getScalarType();
-
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
- SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
- }
-
- // ... or zeros.
- SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
-}
-
-SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
- if (Result.getNode()) {
- return Result;
- }
-
- StoreSDNode *Store = cast<StoreSDNode>(Op);
- SDValue Chain = Store->getChain();
- if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
- Store->getValue().getValueType().isVector()) {
- return SplitVectorStore(Op, DAG);
- }
-
- EVT MemVT = Store->getMemoryVT();
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
- MemVT.bitsLT(MVT::i32)) {
- unsigned Mask = 0;
- if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
- } else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
- }
- SDValue BasePtr = Store->getBasePtr();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
-
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
- DAG.getConstant(0x3, DL, MVT::i32));
-
- SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
- DAG.getConstant(3, DL, MVT::i32));
-
- SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
- Store->getValue());
-
- SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
-
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
- MaskedValue, ShiftAmt);
-
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
- Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
-
- SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
- }
- return SDValue();
-}
-
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The fractional part of a
-// float is enough to accurately represent up to a 24-bit integer.
-SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
+// float is enough to accurately represent up to a 24-bit signed integer.
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+ bool Sign) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
@@ -1567,20 +1212,26 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
MVT IntVT = MVT::i32;
MVT FltVT = MVT::f32;
- ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
- ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+ unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
+ if (LHSSignBits < 9)
+ return SDValue();
- if (VT.isVector()) {
- unsigned NElts = VT.getVectorNumElements();
- IntVT = MVT::getVectorVT(MVT::i32, NElts);
- FltVT = MVT::getVectorVT(MVT::f32, NElts);
- }
+ unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
+ if (RHSSignBits < 9)
+ return SDValue();
+
+ unsigned BitSize = VT.getSizeInBits();
+ unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
+ unsigned DivBits = BitSize - SignBits;
+ if (Sign)
+ ++DivBits;
- unsigned BitSize = VT.getScalarType().getSizeInBits();
+ ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+ ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
SDValue jq = DAG.getConstant(1, DL, IntVT);
- if (sign) {
+ if (Sign) {
// char|short jq = ia ^ ib;
jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
@@ -1590,18 +1241,13 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = jq | 0x1
jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
-
- // jq = (int)jq
- jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
}
// int ia = (int)LHS;
- SDValue ia = sign ?
- DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
+ SDValue ia = LHS;
// int ib, (int)RHS;
- SDValue ib = sign ?
- DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
+ SDValue ib = RHS;
// float fa = (float)ia;
SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
@@ -1609,8 +1255,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// float fb = (float)ib;
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
- // TODO: Should this propagate fast-math-flags?
- // float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
@@ -1621,8 +1265,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
// float fr = mad(fqneg, fb, fa);
- SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT,
- DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
+ SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1641,9 +1284,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
// jq = (cv ? jq : 0);
jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
- // dst = trunc/extend to legal type
- iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
-
// dst = iq + jq;
SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
@@ -1651,11 +1291,19 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool
SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
- SDValue Res[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Res, DL);
+ // Truncate to number of bits this divide really is.
+ if (Sign) {
+ SDValue InRegSize
+ = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
+ Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
+ Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
+ } else {
+ SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
+ Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
+ Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
+ }
+
+ return DAG.getMergeValues({ Div, Rem }, DL);
}
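The whole fast path in plain C++, for orientation: a float's 24-bit significand represents every integer of magnitude below 2^24 exactly, so the float quotient is at most one unit short and the jq term repairs it. A hedged scalar model of the signed case, assuming both inputs already passed the 9-sign-bit checks above:

    #include <cmath>
    #include <cstdint>

    // Scalar model of DIVREM24's quotient computation (not the DAG code).
    static int32_t sdiv24(int32_t ia, int32_t ib) {
      int32_t jq = ((ia ^ ib) >> 30) | 1;        // +/-1 with quotient's sign
      float fa = static_cast<float>(ia);
      float fb = static_cast<float>(ib);
      float fq = std::trunc(fa * (1.0f / fb));   // approximate quotient
      int32_t iq = static_cast<int32_t>(fq);     // candidate result
      float fr = std::fabs(fa - fq * fb);        // estimate's remainder
      return fr >= std::fabs(fb) ? iq + jq : iq; // off-by-one correction
    }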
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
@@ -1686,10 +1334,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
LHS_Lo, RHS_Lo);
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero);
- Results.push_back(DIV);
- Results.push_back(REM);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
+ Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
return;
}
@@ -1698,7 +1347,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
- SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero);
+ SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
+ REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
SDValue DIV_Lo = zero;
@@ -1718,7 +1368,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// Add LHS high bit
REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
- SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT);
+ SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
@@ -1728,7 +1378,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
}
- SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+ SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
+ DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
Results.push_back(DIV);
Results.push_back(REM);
}
@@ -1744,19 +1395,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return DAG.getMergeValues(Results, DL);
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
if (VT == MVT::i32) {
- if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) &&
- DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) {
- // TODO: We technically could do this for i64, but shouldn't that just be
- // handled by something generally reducing 64-bit division on 32-bit
- // values to 32-bit?
- return LowerDIVREM24(Op, DAG, false);
- }
+ if (SDValue Res = LowerDIVREM24(Op, DAG, false))
+ return Res;
}
+ SDValue Num = Op.getOperand(0);
+ SDValue Den = Op.getOperand(1);
+
// RCP = URECIP(Den) = 2^32 / Den + e
// e is rounding error.
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
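
Per the comment, URECIP yields 2^32 / Den plus a rounding error e. A hedged scalar sketch of why such an estimate suffices, assuming Den > 1 and an estimate that never exceeds the true reciprocal; the actual refinement steps follow this hunk and are not shown here:

    #include <cstdint>

    uint32_t udivViaRecip(uint32_t Num, uint32_t Den) {
      // Models URECIP with e = 0: floor(2^32 / Den).
      uint32_t RCP = (uint32_t)((UINT64_C(1) << 32) / Den);
      // mulhu(Num, RCP) can only underestimate the true quotient.
      uint32_t Quo = (uint32_t)(((uint64_t)Num * RCP) >> 32);
      uint32_t Rem = Num - Quo * Den;
      while (Rem >= Den) {         // correct for the rounding error e
        ++Quo;
        Rem -= Den;
      }
      return Quo;
    }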
@@ -1864,11 +1510,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue NegOne = DAG.getConstant(-1, DL, VT);
- if (VT == MVT::i32 &&
- DAG.ComputeNumSignBits(LHS) > 8 &&
- DAG.ComputeNumSignBits(RHS) > 8) {
- return LowerDIVREM24(Op, DAG, true);
+ if (VT == MVT::i32) {
+ if (SDValue Res = LowerDIVREM24(Op, DAG, true))
+ return Res;
}
+
if (VT == MVT::i64 &&
DAG.ComputeNumSignBits(LHS) > 32 &&
DAG.ComputeNumSignBits(RHS) > 32) {
@@ -1954,7 +1600,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) {
+static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
+ SelectionDAG &DAG) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
@@ -1992,8 +1639,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
  // Extend back to 64 bits.
- SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
- Zero, SignBit);
+ SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
@@ -2391,7 +2037,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
MVT::i32, FloorMul);
SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
- SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi);
+ SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
}
@@ -2437,7 +2083,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
for (unsigned I = 0; I < NElts; ++I)
Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
- return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+ return DAG.getBuildVector(VT, DL, Args);
}
//===----------------------------------------------------------------------===//
@@ -2476,8 +2122,8 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
}
template <typename IntTy>
-static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
- uint32_t Offset, uint32_t Width, SDLoc DL) {
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
+ uint32_t Width, const SDLoc &DL) {
if (Width + Offset < 32) {
uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
@@ -2487,55 +2133,175 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
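
A standalone model of the signed instantiation of constantFoldBFE (IntTy = int32_t), assuming 0 < Width and an arithmetic right shift for signed types, as the template above also does:

    #include <cstdint>

    int32_t bfeI32(int32_t Src0, uint32_t Offset, uint32_t Width) {
      if (Width + Offset < 32) {
        // Move the field to the top, then sign-extend it back down.
        uint32_t Shl = (uint32_t)Src0 << (32 - Offset - Width);
        return (int32_t)Shl >> (32 - Width);
      }
      // The field reaches bit 31: a plain shift already sign-extends.
      return Src0 >> Offset;
    }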
-static bool usesAllNormalStores(SDNode *LoadVal) {
- for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) {
- if (!ISD::isNormalStore(*I))
- return false;
+static bool hasVolatileUser(SDNode *Val) {
+ for (SDNode *U : Val->uses()) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
+ if (M->isVolatile())
+ return true;
+ }
}
+ return false;
+}
+
+bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
+ // i32 vectors are the canonical memory type.
+ if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
+ return false;
+
+ if (!VT.isByteSized())
+ return false;
+
+ unsigned Size = VT.getStoreSize();
+
+ if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
+ return false;
+
+ if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
+ return false;
+
return true;
}
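
Restating the size rules in isolation may help; this sketch ignores the isTypeLegal and i32-scalar early-outs and is not LLVM API. Sample outcomes when paired with getEquivalentMemType: v2i8 -> i16, v2i16 -> i32, i128 -> v4i32, while scalar i8/i16/f32 and 3-byte or oddly sized types are left alone.

    // Decide from the store size in bytes whether a cast to an
    // i32-based type is worthwhile (sketch of the checks above).
    bool shouldCombineBytes(unsigned Size, bool IsVector) {
      if ((Size == 1 || Size == 2 || Size == 4) && !IsVector)
        return false;              // fine as a scalar access already
      if (Size == 3 || (Size > 4 && Size % 4 != 0))
        return false;              // no clean i32-based equivalent
      return true;
    }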
-// If we have a copy of an illegal type, replace it with a load / store of an
-// equivalently sized legal type. This avoids intermediate bit pack / unpack
-// instructions emitted when handling extloads and truncstores. Ideally we could
-// recognize the pack / unpack pattern to eliminate it.
+// Replace a load of an illegal type with a load of a bitcast to a friendlier
+// type.
+SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ LoadSDNode *LN = cast<LoadSDNode>(N);
+ if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ return SDValue();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = LN->getMemoryVT();
+
+ unsigned Size = VT.getStoreSize();
+ unsigned Align = LN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = LN->getAddressSpace();
+
+ // Expand unaligned loads before legalization. Due to visitation order
+ // problems during legalization, the instructions emitted to pack and unpack
+ // the bytes again are not eliminated in the case of an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+ return DAG.getMergeValues(Ops, SDLoc(N));
+ }
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+
+ SDValue NewLoad
+ = DAG.getLoad(NewVT, SL, LN->getChain(),
+ LN->getBasePtr(), LN->getMemOperand());
+
+ SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
+ DCI.CombineTo(N, BC, NewLoad.getValue(1));
+ return SDValue(N, 0);
+}
+
+// Replace a store of an illegal type with a store of a bitcast to a friendlier
+// type.
SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!DCI.isBeforeLegalize())
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- SDValue Value = SN->getValue();
- EVT VT = Value.getValueType();
+ if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ return SDValue();
- if (isTypeLegal(VT) || SN->isVolatile() ||
- !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8)
+ EVT VT = SN->getMemoryVT();
+ unsigned Size = VT.getStoreSize();
+
+ SDLoc SL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned Align = SN->getAlignment();
+ if (Align < Size && isTypeLegal(VT)) {
+ bool IsFast;
+ unsigned AS = SN->getAddressSpace();
+
+ // Expand unaligned stores before legalization. Due to visitation order
+ // problems during legalization, the instructions emitted to pack and unpack
+ // the bytes again are not eliminated in the case of an unaligned copy.
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast))
+ return expandUnalignedStore(SN, DAG);
+
+ if (!IsFast)
+ return SDValue();
+ }
+
+ if (!shouldCombineMemoryType(VT))
+ return SDValue();
+
+ EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
+ SDValue Val = SN->getValue();
+
+ bool OtherUses = !Val.hasOneUse();
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
+ if (OtherUses) {
+ SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
+ DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
+ }
+
+ return DAG.getStore(SN->getChain(), SL, CastVal,
+ SN->getBasePtr(), SN->getMemOperand());
+}
+
+// TODO: Should repeat for other bit ops.
+SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
return SDValue();
- LoadSDNode *LoadVal = cast<LoadSDNode>(Value);
- if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal))
+ // Break up a 64-bit AND of a constant into two 32-bit ANDs. This will
+ // typically happen anyway for a VALU 64-bit AND. It also exposes other
+ // 32-bit integer combine opportunities, since most 64-bit operations are
+ // decomposed this way.
+ // TODO: We won't want this for SALU, especially if the constant is an
+ // inline immediate.
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
return SDValue();
- EVT MemVT = LoadVal->getMemoryVT();
+ uint64_t Val = RHS->getZExtValue();
+ if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) {
+ // If either half of the constant is 0, this is really a 32-bit and, so
+ // split it. If we can re-use the full materialized constant, keep it.
+ return SDValue();
+ }
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT);
- SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
- LoadVT, SL,
- LoadVal->getChain(),
- LoadVal->getBasePtr(),
- LoadVal->getOffset(),
- LoadVT,
- LoadVal->getMemOperand());
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG);
- SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0));
- DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false);
+ SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32);
+ SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
- return DAG.getStore(SN->getChain(), SL, NewLoad,
- SN->getBasePtr(), SN->getMemOperand());
+ SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS);
+ SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS);
+
+ // Re-visit the split halves: if one of the new ands folds away, the
+ // build_vector below may simplify further.
+ DCI.AddToWorklist(Lo.getNode());
+ DCI.AddToWorklist(Hi.getNode());
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
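
In scalar terms the combine computes each half independently and reassembles them with the usual build_vector + bitcast idiom. A minimal model (illustrative only):

    #include <cstdint>

    uint64_t and64(uint64_t X, uint64_t K) {
      uint32_t Lo = (uint32_t)X & (uint32_t)K;                  // LoAnd
      uint32_t Hi = (uint32_t)(X >> 32) & (uint32_t)(K >> 32);  // HiAnd
      return ((uint64_t)Hi << 32) | Lo;
    }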
SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
@@ -2543,14 +2309,17 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
if (N->getValueType(0) != MVT::i64)
return SDValue();
- // i64 (shl x, 32) -> (build_pair 0, x)
+ // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32)) for C >= 32
- // Doing this with moves theoretically helps MI optimizations that understand
- // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
- // v_lshl_b64. In the SALU case, I think this is slightly worse since it
- // doubles the code size and I'm unsure about cycle count.
+ // On some subtargets, 64-bit shift is a quarter-rate instruction. In the
+ // common case, splitting it into a move and a 32-bit shift is faster and
+ // no larger in code size.
const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHS || RHS->getZExtValue() != 32)
+ if (!RHS)
+ return SDValue();
+
+ unsigned RHSVal = RHS->getZExtValue();
+ if (RHSVal < 32)
return SDValue();
SDValue LHS = N->getOperand(0);
@@ -2558,11 +2327,85 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- // Extract low 32-bits.
+ SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
+
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+ SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
+
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+}
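
A scalar model of the new shl combine, assuming 32 <= C < 64:

    #include <cstdint>

    uint64_t shl64(uint64_t X, unsigned C) {
      // The low word of the result is zero; the high word is the low
      // word of X shifted left by C - 32.
      uint32_t Hi = (uint32_t)X << (C - 32);
      return (uint64_t)Hi << 32;
    }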
+
+SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ unsigned RHSVal = RHS->getZExtValue();
+
+ // (sra i64:x, 32) -> build_pair hi_32(x), (sra hi_32(x), 31)
+ if (RHSVal == 32) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
+ if (RHSVal == 63) {
+ SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
+ SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
+ DAG.getConstant(31, SL, MVT::i32));
+ SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
+ }
+
+ return SDValue();
+}
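
The two sra special cases in scalar form, assuming arithmetic right shift for signed types:

    #include <cstdint>

    int64_t sra64_32(int64_t X) {                     // (sra x, 32)
      uint32_t Hi = (uint32_t)((uint64_t)X >> 32);
      uint32_t Sign = (uint32_t)((int32_t)Hi >> 31);  // 0 or ~0
      return (int64_t)(((uint64_t)Sign << 32) | Hi);
    }

    int64_t sra64_63(int64_t X) {                     // (sra x, 63)
      uint32_t Sign = (uint32_t)((int32_t)((uint64_t)X >> 32) >> 31);
      return (int64_t)(((uint64_t)Sign << 32) | Sign);
    }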
+
+SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ if (N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHS)
+ return SDValue();
+
+ unsigned ShiftAmt = RHS->getZExtValue();
+ if (ShiftAmt < 32)
+ return SDValue();
+
+ // srl i64:x, C for C >= 32
+ // =>
+ // build_pair (srl hi_32(x), C - 32), 0
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ SDValue One = DAG.getConstant(1, SL, MVT::i32);
+ SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
+ VecOp, One);
+
+ SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
+ SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
+
+ SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
+
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}
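
And the srl counterpart, assuming 32 <= C < 64 and the little-endian element order where element 1 of the v2i32 bitcast is the high word:

    #include <cstdint>

    uint64_t srl64(uint64_t X, unsigned C) {
      uint32_t Hi = (uint32_t)(X >> 32); // extract_vector_elt ..., 1
      return Hi >> (C - 32);             // the result's high word is zero
    }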
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -2610,8 +2453,8 @@ static bool isCtlzOpc(unsigned Opc) {
// type VT.
// Need to match pre-legalized type because the generic legalization inserts the
// add/sub between the select and compare.
-static SDValue getFFBH_U32(const TargetLowering &TLI,
- SelectionDAG &DAG, SDLoc SL, SDValue Op) {
+static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Op) {
EVT VT = Op.getValueType();
EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
if (LegalVT != MVT::i32)
@@ -2634,10 +2477,8 @@ static SDValue getFFBH_U32(const TargetLowering &TLI,
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
-SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL,
- SDValue Cond,
- SDValue LHS,
- SDValue RHS,
+SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
+ SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
if (!CmpRhs || !CmpRhs->isNullValue())
@@ -2680,8 +2521,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
SDValue True = N->getOperand(1);
SDValue False = N->getOperand(2);
- if (VT == MVT::f32 && Cond.hasOneUse())
- return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ if (VT == MVT::f32 && Cond.hasOneUse()) {
+ SDValue MinMax
+ = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+ // TODO: Revisit this node so we can catch min3/max3/med3 patterns.
+ //DCI.AddToWorklist(MinMax.getNode());
+ return MinMax;
+ }
// There's no reason to not do this if the condition has other uses.
return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
@@ -2695,12 +2541,62 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
switch(N->getOpcode()) {
default:
break;
+ case ISD::BITCAST: {
+ EVT DestVT = N->getValueType(0);
+ if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+ break;
+
+ // Fold bitcasts of constants.
+ //
+ // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
+ // TODO: Generalize and move to DAGCombiner
+ SDValue Src = N->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
+ assert(Src.getValueType() == MVT::i64);
+ SDLoc SL(N);
+ uint64_t CVal = C->getZExtValue();
+ // Build a v2i32 first and bitcast, so DestVT need not be v2i32 itself,
+ // matching the ConstantFP path below.
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL,
+ {DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32)});
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+ }
+
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
+ const APInt &Val = C->getValueAPF().bitcastToAPInt();
+ SDLoc SL(N);
+ uint64_t CVal = Val.getZExtValue();
+ SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL,
+ {DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+ DAG.getConstant(Hi_32(CVal), SL, MVT::i32)});
+
+ return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
+ }
+
+ break;
+ }
case ISD::SHL: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;
return performShlCombine(N, DCI);
}
+ case ISD::SRL: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSrlCombine(N, DCI);
+ }
+ case ISD::SRA: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performSraCombine(N, DCI);
+ }
+ case ISD::AND: {
+ if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+ break;
+
+ return performAndCombine(N, DCI);
+ }
case ISD::MUL:
return performMulCombine(N, DCI);
case AMDGPUISD::MUL_I24:
@@ -2797,7 +2693,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
break;
}
-
+ case ISD::LOAD:
+ return performLoadCombine(N, DCI);
case ISD::STORE:
return performStoreCombine(N, DCI);
}
@@ -2840,20 +2737,6 @@ void AMDGPUTargetLowering::getOriginalFunctionArgs(
}
}
-bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->isExactlyValue(1.0);
- }
- return isAllOnesConstant(Op);
-}
-
-bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const {
- if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- return CFP->getValueAPF().isZero();
- }
- return isNullConstant(Op);
-}
-
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
@@ -2889,10 +2772,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(ENDPGM)
+ NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
@@ -2906,6 +2790,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMIN3)
NODE_NAME_CASE(SMIN3)
NODE_NAME_CASE(UMIN3)
+ NODE_NAME_CASE(FMED3)
+ NODE_NAME_CASE(SMED3)
+ NODE_NAME_CASE(UMED3)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
NODE_NAME_CASE(DIV_FMAS)
@@ -2914,7 +2801,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RSQ_LEGACY)
- NODE_NAME_CASE(RSQ_CLAMPED)
+ NODE_NAME_CASE(RSQ_CLAMP)
NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
@@ -2934,7 +2821,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
NODE_NAME_CASE(REGISTER_STORE)
- NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(LOAD_INPUT)
NODE_NAME_CASE(SAMPLE)
NODE_NAME_CASE(SAMPLEB)
@@ -2946,13 +2832,18 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_F32_UBYTE3)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
+ NODE_NAME_CASE(PC_ADD_REL_OFFSET)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(STORE_MSKOR)
+ NODE_NAME_CASE(LOAD_CONSTANT)
NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+ NODE_NAME_CASE(ATOMIC_CMP_SWAP)
+ NODE_NAME_CASE(ATOMIC_INC)
+ NODE_NAME_CASE(ATOMIC_DEC)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
return nullptr;
@@ -2998,21 +2889,6 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
return SDValue();
}
-static void computeKnownBitsForMinMax(const SDValue Op0,
- const SDValue Op1,
- APInt &KnownZero,
- APInt &KnownOne,
- const SelectionDAG &DAG,
- unsigned Depth) {
- APInt Op0Zero, Op0One;
- APInt Op1Zero, Op1One;
- DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
- DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
-
- KnownZero = Op0Zero & Op1Zero;
- KnownOne = Op0One & Op1One;
-}
-
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
@@ -3029,22 +2905,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
switch (Opc) {
default:
break;
- case ISD::INTRINSIC_WO_CHAIN: {
- // FIXME: The intrinsic should just use the node.
- switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
- case AMDGPUIntrinsic::AMDGPU_imax:
- case AMDGPUIntrinsic::AMDGPU_umax:
- case AMDGPUIntrinsic::AMDGPU_imin:
- case AMDGPUIntrinsic::AMDGPU_umin:
- computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
- KnownZero, KnownOne, DAG, Depth);
- break;
- default:
- break;
- }
-
- break;
- }
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW: {
KnownZero = APInt::getHighBitsSet(32, 31);