path: root/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2021-06-13 19:31:46 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2021-07-31 18:56:55 +0000
commit     af732203b8f7f006927528db5497f5cbc4c4742a (patch)
tree       596f112de3b76118552871dbb6114bb7e3e17f40 /contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
parent     83dea422ac8d4a8323e64203c2eadaa813768717 (diff)
download   src-af732203b8f7f006927528db5497f5cbc4c4742a.tar.gz
           src-af732203b8f7f006927528db5497f5cbc4c4742a.zip
Merge llvm-project 12.0.1 release and follow-up fixes
Merge llvm-project main llvmorg-12-init-17869-g8e464dd76bef

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12-init-17869-g8e464dd76bef, the last commit before the
upstream release/12.x branch was created.

PR: 255570
(cherry picked from commit e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)

Merge llvm-project 12.0.0 release

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.0-0-gd28af7c654d8, a.k.a. 12.0.0 release.

PR: 255570
(cherry picked from commit d409305fa3838fb39b38c26fc085fb729b8766d5)

Disable strict-fp for powerpcspe, as it does not work properly yet

Merge commit 5c18d1136665 from llvm git (by Qiu Chaofan):

  [SPE] Disable strict-fp for SPE by default

  As discussed in PR50385, strict-fp on PowerPC SPE has not been handled
  well. This patch disables it by default for SPE.

  Reviewed By: nemanjai, vit9696, jhibbits
  Differential Revision: https://reviews.llvm.org/D103235

PR: 255570
(cherry picked from commit 715df83abc049b23d9acddc81f2480bd4c056d64)

Apply upstream libc++ fix to allow building with devel/xxx-xtoolchain-gcc

Merge commit 52e9d80d5db2 from llvm git (by Jason Liu):

  [libc++] add `inline` for __open's definition in ifstream and ofstream

  Summary:
  When building with gcc on AIX, it seems that gcc does not like the
  `always_inline` without the `inline` keyword. So adding the inline
  keywords in for __open in ifstream and ofstream. That will also make it
  consistent with __open in basic_filebuf (it seems we added `inline`
  there before for gcc build as well).

  Differential Revision: https://reviews.llvm.org/D99422

PR: 255570
(cherry picked from commit d099db25464b826c5724cf2fb5b22292bbe15f6e)

Undefine HAVE_(DE)REGISTER_FRAME in llvm's config.h on arm

Otherwise, the lli tool (enabled by WITH_CLANG_EXTRAS) won't link on arm,
stating that __register_frame is undefined. This function is normally
provided by libunwind, but explicitly not for the ARM Exception ABI.

Reported by: oh
PR: 255570
(cherry picked from commit f336b45e943c7f9a90ffcea1a6c4c7039e54c73c)

Merge llvm-project 12.0.1 rc2

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-rc2-0-ge7dac564cd0e, a.k.a. 12.0.1 rc2.

PR: 255570
(cherry picked from commit 23408297fbf3089f0388a8873b02fa75ab3f5bb9)

Revert libunwind change to fix backtrace segfault on aarch64

Revert commit 22b615a96593 from llvm git (by Daniel Kiss):

  [libunwind] Support for leaf function unwinding.

  Unwinding leaf function is useful in cases when the backtrace finds a
  leaf function for example when it caused a signal. This patch also add
  the support for the DW_CFA_undefined because it marks the end of the
  frames. Ryan Prichard provided code for the tests.

  Reviewed By: #libunwind, mstorsjo
  Differential Revision: https://reviews.llvm.org/D83573

  Reland with limit the test to the x86_64-linux target.

Bisection has shown that this particular upstream commit causes programs
using backtrace(3) on aarch64 to segfault. This affects the lang/rust port,
for instance. Until this can be fixed upstream, revert the commit for now.

Reported by: mikael
PR: 256864
(cherry picked from commit 5866c369e4fd917c0d456f0f10b92ee354b82279)

Merge llvm-project 12.0.1 release

This updates llvm, clang, compiler-rt, libc++, libunwind, lld, lldb and
openmp to llvmorg-12.0.1-0-gfed41342a82f, a.k.a. 12.0.1 release.

PR: 255570
(cherry picked from commit 4652422eb477731f284b1345afeefef7f269da50)

compiler-rt: build out-of-line LSE atomics helpers for aarch64

Both clang >= 12 and gcc >= 10.1 now default to -moutline-atomics for
aarch64. This requires a bunch of helper functions in libcompiler_rt.a, to
avoid link errors like "undefined symbol: __aarch64_ldadd8_acq_rel".

(Note: of course you can use -mno-outline-atomics as a workaround too, but
this would negate the potential performance benefit of the faster LSE
instructions.)

Bump __FreeBSD_version so ports maintainers can easily detect this.

PR: 257392
(cherry picked from commit cc55ee8009a550810d38777fd6ace9abf3a2f6b4)
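As a side note on the out-of-line LSE atomics item above, here is a minimal sketch (not part of this commit, purely illustrative) of the kind of code that now depends on those helpers: built with clang >= 12 for aarch64 at the default -moutline-atomics, the 8-byte fetch-add below is lowered to a call to __aarch64_ldadd8_acq_rel, which the updated libcompiler_rt.a has to provide at link time.

    // Illustrative only; the helper name assumes an LP64 aarch64 target where
    // long is 8 bytes. With -moutline-atomics (the default in clang >= 12 and
    // gcc >= 10.1 on aarch64) this fetch_add calls __aarch64_ldadd8_acq_rel
    // instead of inlining an LL/SC or LSE sequence.
    #include <atomic>

    long add_acq_rel(std::atomic<long> &counter, long v) {
      return counter.fetch_add(v, std::memory_order_acq_rel);
    }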
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp  1075
1 file changed, 1075 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
new file mode 100644
index 000000000000..06aa0055e4bb
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -0,0 +1,1075 @@
+//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements the AMDGPU-specific InstCombine hooks for target
+// intrinsics, exposed through TargetTransformInfo. It uses the target's
+// detailed information to fold and simplify AMDGPU intrinsic calls, while
+// letting the target-independent InstCombine transforms handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetTransformInfo.h"
+#include "GCNSubtarget.h"
+#include "R600Subtarget.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "AMDGPUtti"
+
+namespace {
+
+struct AMDGPUImageDMaskIntrinsic {
+ unsigned Intr;
+};
+
+#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
+#include "InstCombineTables.inc"
+
+} // end anonymous namespace
+
+// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
+//
+// A single NaN input is folded to minnum, so we rely on that folding for
+// handling NaNs.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+ const APFloat &Src2) {
+ APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
+
+ APFloat::cmpResult Cmp0 = Max3.compare(Src0);
+ assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp0 == APFloat::cmpEqual)
+ return maxnum(Src1, Src2);
+
+ APFloat::cmpResult Cmp1 = Max3.compare(Src1);
+ assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
+ if (Cmp1 == APFloat::cmpEqual)
+ return maxnum(Src0, Src2);
+
+ return maxnum(Src0, Src1);
+}
+
+// Check if a value can be converted to a 16-bit value without losing
+// precision.
+static bool canSafelyConvertTo16Bit(Value &V) {
+ Type *VTy = V.getType();
+ if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
+ // The value is already 16-bit, so we don't want to convert to 16-bit again!
+ return false;
+ }
+ if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+ // We need to check that if we cast the index down to a half, we do not lose
+ // precision.
+ APFloat FloatValue(ConstFloat->getValueAPF());
+ bool LosesInfo = true;
+ FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
+ return !LosesInfo;
+ }
+ Value *CastSrc;
+ if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
+ match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
+ match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+ Type *CastSrcTy = CastSrc->getType();
+ if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
+ return true;
+ }
+
+ return false;
+}
+
+// Convert a value to 16-bit.
+static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
+ Type *VTy = V.getType();
+ if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
+ return cast<Instruction>(&V)->getOperand(0);
+ if (VTy->isIntegerTy())
+ return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
+ if (VTy->isFloatingPointTy())
+ return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
+
+ llvm_unreachable("Should never be called!");
+}
+
+static Optional<Instruction *>
+simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+ IntrinsicInst &II, InstCombiner &IC) {
+ if (!ST->hasA16() && !ST->hasG16())
+ return None;
+
+ bool FloatCoord = false;
+ // If true, only the derivatives can be converted to 16-bit, not the coordinates.
+ bool OnlyDerivatives = false;
+
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
+ Value *Coord = II.getOperand(OperandIndex);
+ // If the values are not derived from 16-bit values, we cannot optimize.
+ if (!canSafelyConvertTo16Bit(*Coord)) {
+ if (OperandIndex < ImageDimIntr->CoordStart ||
+ ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
+ return None;
+ }
+ // All gradients can be converted, so convert only them
+ OnlyDerivatives = true;
+ break;
+ }
+
+ assert(OperandIndex == ImageDimIntr->GradientStart ||
+ FloatCoord == Coord->getType()->isFloatingPointTy());
+ FloatCoord = Coord->getType()->isFloatingPointTy();
+ }
+
+ if (OnlyDerivatives) {
+ if (!ST->hasG16())
+ return None;
+ } else {
+ if (!ST->hasA16())
+ OnlyDerivatives = true; // Only supports G16
+ }
+
+ Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
+ : Type::getInt16Ty(II.getContext());
+
+ SmallVector<Type *, 4> ArgTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+ return None;
+
+ ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
+ if (!OnlyDerivatives)
+ ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
+ Function *I =
+ Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
+
+ SmallVector<Value *, 8> Args(II.arg_operands());
+
+ unsigned EndIndex =
+ OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < EndIndex; OperandIndex++) {
+ Args[OperandIndex] =
+ convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
+ }
+
+ CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+ if (isa<FPMathOperator>(NewCall))
+ NewCall->copyFastMathFlags(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+}
+
+bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+ InstCombiner &IC) const {
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0. If we can prove we don't have one of the special
+ // cases then we can use a normal multiply instead.
+ // TODO: Create and use isKnownFiniteNonZero instead of just matching
+ // constants here.
+ if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
+ match(Op1, PatternMatch::m_FiniteNonZero())) {
+ // One operand is not zero or infinity or NaN.
+ return true;
+ }
+ auto *TLI = &IC.getTargetLibraryInfo();
+ if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
+ isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+ // Neither operand is infinity or NaN.
+ return true;
+ }
+ return false;
+}
+
+Optional<Instruction *>
+GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
+ Intrinsic::ID IID = II.getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::amdgcn_rcp: {
+ Value *Src = II.getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II.getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ if (II.isStrictFP())
+ break;
+
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ const APFloat &ArgVal = C->getValueAPF();
+ APFloat Val(ArgVal.getSemantics(), 1);
+ Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
+
+ // This is more precise than the instruction may give.
+ //
+ // TODO: The instruction always flushes denormal results (except for f16),
+ // should this also?
+ return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_rsq: {
+ Value *Src = II.getArgOperand(0);
+
+ // TODO: Move to ConstantFolding/InstSimplify?
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II.getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_frexp_mant:
+ case Intrinsic::amdgcn_frexp_exp: {
+ Value *Src = II.getArgOperand(0);
+ if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ int Exp;
+ APFloat Significand =
+ frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
+
+ if (IID == Intrinsic::amdgcn_frexp_mant) {
+ return IC.replaceInstUsesWith(
+ II, ConstantFP::get(II.getContext(), Significand));
+ }
+
+ // Match instruction special case behavior.
+ if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
+ Exp = 0;
+
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
+ }
+
+ if (isa<UndefValue>(Src)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_class: {
+ enum {
+ S_NAN = 1 << 0, // Signaling NaN
+ Q_NAN = 1 << 1, // Quiet NaN
+ N_INFINITY = 1 << 2, // Negative infinity
+ N_NORMAL = 1 << 3, // Negative normal
+ N_SUBNORMAL = 1 << 4, // Negative subnormal
+ N_ZERO = 1 << 5, // Negative zero
+ P_ZERO = 1 << 6, // Positive zero
+ P_SUBNORMAL = 1 << 7, // Positive subnormal
+ P_NORMAL = 1 << 8, // Positive normal
+ P_INFINITY = 1 << 9 // Positive infinity
+ };
+
+ const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
+ N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
+ P_NORMAL | P_INFINITY;
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
+ if (!CMask) {
+ if (isa<UndefValue>(Src0)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ if (isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II,
+ ConstantInt::get(II.getType(), false));
+ }
+ break;
+ }
+
+ uint32_t Mask = CMask->getZExtValue();
+
+ // If all tests are made, it doesn't matter what the value is.
+ if ((Mask & FullMask) == FullMask) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
+ }
+
+ if ((Mask & FullMask) == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
+ }
+
+ if (Mask == (S_NAN | Q_NAN)) {
+ // Equivalent of isnan. Replace with standard fcmp.
+ Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
+ FCmp->takeName(&II);
+ return IC.replaceInstUsesWith(II, FCmp);
+ }
+
+ if (Mask == (N_ZERO | P_ZERO)) {
+ // Equivalent of == 0.
+ Value *FCmp =
+ IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
+
+ FCmp->takeName(&II);
+ return IC.replaceInstUsesWith(II, FCmp);
+ }
+
+ // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
+ if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
+ isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
+ return IC.replaceOperand(
+ II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
+ }
+
+ const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
+ if (!CVal) {
+ if (isa<UndefValue>(Src0)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ // Clamp mask to used bits
+ if ((Mask & FullMask) != Mask) {
+ CallInst *NewCall = IC.Builder.CreateCall(
+ II.getCalledFunction(),
+ {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
+
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ break;
+ }
+
+ const APFloat &Val = CVal->getValueAPF();
+
+ bool Result =
+ ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
+ ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
+ ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
+ ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
+ ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
+ ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
+ ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
+ ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
+ ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
+ ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
+
+ return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
+ }
+ case Intrinsic::amdgcn_cvt_pkrtz: {
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ const fltSemantics &HalfSem =
+ II.getType()->getScalarType()->getFltSemantics();
+ bool LosesInfo;
+ APFloat Val0 = C0->getValueAPF();
+ APFloat Val1 = C1->getValueAPF();
+ Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+ Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+ Constant *Folded =
+ ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
+ ConstantFP::get(II.getContext(), Val1)});
+ return IC.replaceInstUsesWith(II, Folded);
+ }
+ }
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
+ return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ubfe:
+ case Intrinsic::amdgcn_sbfe: {
+ // Decompose simple cases into standard shifts.
+ Value *Src = II.getArgOperand(0);
+ if (isa<UndefValue>(Src)) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+
+ unsigned Width;
+ Type *Ty = II.getType();
+ unsigned IntSize = Ty->getIntegerBitWidth();
+
+ ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ if (CWidth) {
+ Width = CWidth->getZExtValue();
+ if ((Width & (IntSize - 1)) == 0) {
+ return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
+ }
+
+ // Hardware ignores high bits, so remove those.
+ if (Width >= IntSize) {
+ return IC.replaceOperand(
+ II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
+ }
+ }
+
+ unsigned Offset;
+ ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ if (COffset) {
+ Offset = COffset->getZExtValue();
+ if (Offset >= IntSize) {
+ return IC.replaceOperand(
+ II, 1,
+ ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
+ }
+ }
+
+ bool Signed = IID == Intrinsic::amdgcn_sbfe;
+
+ if (!CWidth || !COffset)
+ break;
+
+ // The case of Width == 0 is handled above, which makes this transformation
+ // safe. If Width == 0, then the ashr and lshr instructions would become
+ // poison values, since the shift amount would equal the bit size.
+ assert(Width != 0);
+
+ // TODO: This allows folding to undef when the hardware has specific
+ // behavior?
+ if (Offset + Width < IntSize) {
+ Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
+ Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
+ : IC.Builder.CreateLShr(Shl, IntSize - Width);
+ RightShift->takeName(&II);
+ return IC.replaceInstUsesWith(II, RightShift);
+ }
+
+ Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
+ : IC.Builder.CreateLShr(Src, Offset);
+
+ RightShift->takeName(&II);
+ return IC.replaceInstUsesWith(II, RightShift);
+ }
+ case Intrinsic::amdgcn_exp:
+ case Intrinsic::amdgcn_exp_compr: {
+ ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
+ unsigned EnBits = En->getZExtValue();
+ if (EnBits == 0xf)
+ break; // All inputs enabled.
+
+ bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
+ bool Changed = false;
+ for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+ if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+ (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+ Value *Src = II.getArgOperand(I + 2);
+ if (!isa<UndefValue>(Src)) {
+ IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
+ Changed = true;
+ }
+ }
+ }
+
+ if (Changed) {
+ return &II;
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_fmed3: {
+ // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+ // for the shader.
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+ Value *Src2 = II.getArgOperand(2);
+
+ // Checking for NaN before canonicalization provides better fidelity when
+ // mapping other operations onto fmed3 since the order of operands is
+ // unchanged.
+ CallInst *NewCall = nullptr;
+ if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
+ NewCall = IC.Builder.CreateMinNum(Src1, Src2);
+ } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
+ NewCall = IC.Builder.CreateMinNum(Src0, Src2);
+ } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
+ NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
+ }
+
+ if (NewCall) {
+ NewCall->copyFastMathFlags(&II);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ bool Swap = false;
+ // Canonicalize constants to RHS operands.
+ //
+ // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+ std::swap(Src1, Src2);
+ Swap = true;
+ }
+
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ std::swap(Src0, Src1);
+ Swap = true;
+ }
+
+ if (Swap) {
+ II.setArgOperand(0, Src0);
+ II.setArgOperand(1, Src1);
+ II.setArgOperand(2, Src2);
+ return &II;
+ }
+
+ if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+ if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+ if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
+ APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+ C2->getValueAPF());
+ return IC.replaceInstUsesWith(
+ II, ConstantFP::get(IC.Builder.getContext(), Result));
+ }
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_icmp:
+ case Intrinsic::amdgcn_fcmp: {
+ const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
+ // Guard against invalid arguments.
+ int64_t CCVal = CC->getZExtValue();
+ bool IsInteger = IID == Intrinsic::amdgcn_icmp;
+ if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+ CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+ (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+ CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+ break;
+
+ Value *Src0 = II.getArgOperand(0);
+ Value *Src1 = II.getArgOperand(1);
+
+ if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+ if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+ Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+ if (CCmp->isNullValue()) {
+ return IC.replaceInstUsesWith(
+ II, ConstantExpr::getSExt(CCmp, II.getType()));
+ }
+
+ // The result of V_ICMP/V_FCMP assembly instructions (which this
+ // intrinsic exposes) is one bit per thread, masked with the EXEC
+ // register (which contains the bitmask of live threads). So a
+ // comparison that always returns true is the same as a read of the
+ // EXEC register.
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::read_register, II.getType());
+ Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
+ MDNode *MD = MDNode::get(II.getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ // Canonicalize constants to RHS.
+ CmpInst::Predicate SwapPred =
+ CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+ II.setArgOperand(0, Src1);
+ II.setArgOperand(1, Src0);
+ II.setArgOperand(
+ 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
+ return &II;
+ }
+
+ if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+ break;
+
+ // Canonicalize compare eq with true value to compare != 0
+ // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+ // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+ // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+ // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+ Value *ExtSrc;
+ if (CCVal == CmpInst::ICMP_EQ &&
+ ((match(Src1, PatternMatch::m_One()) &&
+ match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
+ (match(Src1, PatternMatch::m_AllOnes()) &&
+ match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
+ ExtSrc->getType()->isIntegerTy(1)) {
+ IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
+ IC.replaceOperand(II, 2,
+ ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ return &II;
+ }
+
+ CmpInst::Predicate SrcPred;
+ Value *SrcLHS;
+ Value *SrcRHS;
+
+ // Fold compare eq/ne with 0 from a compare result as the predicate to the
+ // intrinsic. The typical use is a wave vote function in the library, which
+ // will be fed from a user code condition compared with 0. Fold in the
+ // redundant compare.
+
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+ // -> llvm.amdgcn.[if]cmp(a, b, pred)
+ //
+ // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+ // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+ if (match(Src1, PatternMatch::m_Zero()) &&
+ match(Src0, PatternMatch::m_ZExtOrSExt(
+ m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
+ PatternMatch::m_Value(SrcRHS))))) {
+ if (CCVal == CmpInst::ICMP_EQ)
+ SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+ Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
+ ? Intrinsic::amdgcn_fcmp
+ : Intrinsic::amdgcn_icmp;
+
+ Type *Ty = SrcLHS->getType();
+ if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
+ // Promote to next legal integer type.
+ unsigned Width = CmpType->getBitWidth();
+ unsigned NewWidth = Width;
+
+ // Don't do anything for i1 comparisons.
+ if (Width == 1)
+ break;
+
+ if (Width <= 16)
+ NewWidth = 16;
+ else if (Width <= 32)
+ NewWidth = 32;
+ else if (Width <= 64)
+ NewWidth = 64;
+ else if (Width > 64)
+ break; // Can't handle this.
+
+ if (Width != NewWidth) {
+ IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
+ if (CmpInst::isSigned(SrcPred)) {
+ SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
+ SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
+ } else {
+ SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
+ SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
+ }
+ }
+ } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
+ break;
+
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
+ Value *Args[] = {SrcLHS, SrcRHS,
+ ConstantInt::get(CC->getType(), SrcPred)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
+ if (Src->isZero()) {
+ // amdgcn.ballot(i1 0) is zero.
+ return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
+ }
+
+ if (Src->isOne()) {
+ // amdgcn.ballot(i1 1) is exec.
+ const char *RegName = "exec";
+ if (II.getType()->isIntegerTy(32))
+ RegName = "exec_lo";
+ else if (!II.getType()->isIntegerTy(64))
+ break;
+
+ Function *NewF = Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::read_register, II.getType());
+ Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
+ MDNode *MD = MDNode::get(II.getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
+ CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(&II);
+ return IC.replaceInstUsesWith(II, NewCall);
+ }
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_wqm_vote: {
+ // wqm_vote is identity when the argument is constant.
+ if (!isa<Constant>(II.getArgOperand(0)))
+ break;
+
+ return IC.replaceInstUsesWith(II, II.getArgOperand(0));
+ }
+ case Intrinsic::amdgcn_kill: {
+ const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
+ if (!C || !C->getZExtValue())
+ break;
+
+ // amdgcn.kill(i1 1) is a no-op
+ return IC.eraseInstFromFunction(II);
+ }
+ case Intrinsic::amdgcn_update_dpp: {
+ Value *Old = II.getArgOperand(0);
+
+ auto *BC = cast<ConstantInt>(II.getArgOperand(5));
+ auto *RM = cast<ConstantInt>(II.getArgOperand(3));
+ auto *BM = cast<ConstantInt>(II.getArgOperand(4));
+ if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
+ BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
+ break;
+
+ // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
+ return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
+ }
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ // Discard vdst_in if it's not going to be read.
+ Value *VDstIn = II.getArgOperand(0);
+ if (isa<UndefValue>(VDstIn))
+ break;
+
+ ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
+ ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
+ if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
+ break;
+
+ return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
+ }
+ case Intrinsic::amdgcn_readfirstlane:
+ case Intrinsic::amdgcn_readlane: {
+ // A constant value is trivially uniform.
+ if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // The remaining folds may not be safe if the exec mask differs between
+ // the def and the use.
+ Value *Src = II.getArgOperand(0);
+ Instruction *SrcInst = dyn_cast<Instruction>(Src);
+ if (SrcInst && SrcInst->getParent() != II.getParent())
+ break;
+
+ // readfirstlane (readfirstlane x) -> readfirstlane x
+ // readlane (readfirstlane x), y -> readfirstlane x
+ if (match(Src,
+ PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+
+ if (IID == Intrinsic::amdgcn_readfirstlane) {
+ // readfirstlane (readlane x, y) -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+ } else {
+ // readlane (readlane x, y), y -> readlane x, y
+ if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
+ PatternMatch::m_Value(),
+ PatternMatch::m_Specific(II.getArgOperand(1))))) {
+ return IC.replaceInstUsesWith(II, Src);
+ }
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This doesn't introduce new instructions and belongs in
+ // InstructionSimplify.
+ Type *Ty = II.getType();
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+
+ // Folding undef to qnan is safe regardless of the FP mode.
+ if (isa<UndefValue>(Op0)) {
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return IC.replaceInstUsesWith(II, QNaN);
+ }
+
+ const APFloat *C = nullptr;
+ match(Op0, PatternMatch::m_APFloat(C));
+
+ // FIXME: Should flush denorms depending on FP mode, but that's ignored
+ // everywhere else.
+ //
+ // These cases should be safe, even with strictfp.
+ // ldexp(0.0, x) -> 0.0
+ // ldexp(-0.0, x) -> -0.0
+ // ldexp(inf, x) -> inf
+ // ldexp(-inf, x) -> -inf
+ if (C && (C->isZero() || C->isInfinity())) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ // With strictfp, be more careful about possibly needing to flush denormals
+ // or not, and snan behavior depends on ieee_mode.
+ if (II.isStrictFP())
+ break;
+
+ if (C && C->isNaN()) {
+ // FIXME: We just need to make the nan quiet here, but that's unavailable
+ // on APFloat, only IEEEfloat
+ auto *Quieted =
+ ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
+ return IC.replaceInstUsesWith(II, Quieted);
+ }
+
+ // ldexp(x, 0) -> x
+ // ldexp(x, undef) -> x
+ if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
+ return IC.replaceInstUsesWith(II, Op0);
+ }
+
+ break;
+ }
+ case Intrinsic::amdgcn_fmul_legacy: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0.
+ // TODO: Move to InstSimplify?
+ if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
+ match(Op1, PatternMatch::m_AnyZeroFP()))
+ return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
+
+ // If we can prove we don't have one of the special cases then we can use a
+ // normal fmul instruction instead.
+ if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
+ FMul->takeName(&II);
+ return IC.replaceInstUsesWith(II, FMul);
+ }
+ break;
+ }
+ case Intrinsic::amdgcn_fma_legacy: {
+ Value *Op0 = II.getArgOperand(0);
+ Value *Op1 = II.getArgOperand(1);
+ Value *Op2 = II.getArgOperand(2);
+
+ // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
+ // infinity, gives +0.0.
+ // TODO: Move to InstSimplify?
+ if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
+ match(Op1, PatternMatch::m_AnyZeroFP())) {
+ // It's tempting to just return Op2 here, but that would give the wrong
+ // result if Op2 was -0.0.
+ auto *Zero = ConstantFP::getNullValue(II.getType());
+ auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
+ FAdd->takeName(&II);
+ return IC.replaceInstUsesWith(II, FAdd);
+ }
+
+ // If we can prove we don't have one of the special cases then we can use a
+ // normal fma instead.
+ if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ II.setCalledOperand(Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::fma, II.getType()));
+ return &II;
+ }
+ break;
+ }
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+ return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+ }
+ }
+ }
+ return None;
+}
+
+/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+///
+/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
+/// struct returns.
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+ IntrinsicInst &II,
+ APInt DemandedElts,
+ int DMaskIdx = -1) {
+
+ auto *IIVTy = cast<FixedVectorType>(II.getType());
+ unsigned VWidth = IIVTy->getNumElements();
+ if (VWidth == 1)
+ return nullptr;
+
+ IRBuilderBase::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&II);
+
+ // Assume the arguments are unchanged and later override them, if needed.
+ SmallVector<Value *, 16> Args(II.args());
+
+ if (DMaskIdx < 0) {
+ // Buffer case.
+
+ const unsigned ActiveBits = DemandedElts.getActiveBits();
+ const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
+
+ // Start assuming the prefix of elements is demanded, but possibly clear
+ // some other bits if there are trailing zeros (unused components at front)
+ // and update offset.
+ DemandedElts = (1 << ActiveBits) - 1;
+
+ if (UnusedComponentsAtFront > 0) {
+ static const unsigned InvalidOffsetIdx = 0xf;
+
+ unsigned OffsetIdx;
+ switch (II.getIntrinsicID()) {
+ case Intrinsic::amdgcn_raw_buffer_load:
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_s_buffer_load:
+ // If resulting type is vec3, there is no point in trimming the
+ // load with updated offset, as the vec3 would most likely be widened to
+ // vec4 anyway during lowering.
+ if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
+ OffsetIdx = InvalidOffsetIdx;
+ else
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_load:
+ OffsetIdx = 2;
+ break;
+ default:
+ // TODO: handle tbuffer* intrinsics.
+ OffsetIdx = InvalidOffsetIdx;
+ break;
+ }
+
+ if (OffsetIdx != InvalidOffsetIdx) {
+ // Clear demanded bits and update the offset.
+ DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
+ auto *Offset = II.getArgOperand(OffsetIdx);
+ unsigned SingleComponentSizeInBits =
+ IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
+ unsigned OffsetAdd =
+ UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
+ auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
+ Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
+ }
+ }
+ } else {
+ // Image case.
+
+ ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+ // Mask off values that are undefined because the dmask doesn't cover them
+ DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
+
+ unsigned NewDMaskVal = 0;
+ unsigned OrigLoadIdx = 0;
+ for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
+ const unsigned Bit = 1 << SrcIdx;
+ if (!!(DMaskVal & Bit)) {
+ if (!!DemandedElts[OrigLoadIdx])
+ NewDMaskVal |= Bit;
+ OrigLoadIdx++;
+ }
+ }
+
+ if (DMaskVal != NewDMaskVal)
+ Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ }
+
+ unsigned NewNumElts = DemandedElts.countPopulation();
+ if (!NewNumElts)
+ return UndefValue::get(II.getType());
+
+ if (NewNumElts >= VWidth && DemandedElts.isMask()) {
+ if (DMaskIdx >= 0)
+ II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
+ return nullptr;
+ }
+
+ // Validate function argument and return types, extracting overloaded types
+ // along the way.
+ SmallVector<Type *, 6> OverloadTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
+ return nullptr;
+
+ Module *M = II.getParent()->getParent()->getParent();
+ Type *EltTy = IIVTy->getElementType();
+ Type *NewTy =
+ (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
+
+ OverloadTys[0] = NewTy;
+ Function *NewIntrin =
+ Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);
+
+ CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+
+ if (NewNumElts == 1) {
+ return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
+ NewCall,
+ DemandedElts.countTrailingZeros());
+ }
+
+ SmallVector<int, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!DemandedElts[OrigLoadIdx])
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+
+ return Shuffle;
+}
+
+Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const {
+ switch (II.getIntrinsicID()) {
+ case Intrinsic::amdgcn_buffer_load:
+ case Intrinsic::amdgcn_buffer_load_format:
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_s_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load_format:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_tbuffer_load:
+ return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
+ default: {
+ if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
+ return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
+ }
+ break;
+ }
+ }
+ return None;
+}