Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp  1766
1 file changed, 1227 insertions(+), 539 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b754836ea517..cc18e55656ef 100644
--- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -170,12 +170,18 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -256,20 +262,25 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
int Cost =
- 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
if (ISD == ISD::SREM) {
// For SREM: (X % C) is the equivalent of (X - (X/C)*C)
- Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
+ Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
+ Op2Info);
}
return Cost;
@@ -277,12 +288,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Vector unsigned division/remainder will be simplified to shifts/masks.
if (ISD == ISD::UDIV)
- return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
else // UREM
- return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
}
@@ -304,6 +317,10 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -370,6 +387,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
@@ -446,11 +471,32 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
};
if (ST->hasAVX2() &&
@@ -495,18 +541,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
- { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
@@ -533,6 +567,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
@@ -568,6 +603,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRL, MVT::v4i64, 1 },
};
+ if (ST->hasAVX512()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX512, a packed v32i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -575,7 +622,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -667,13 +715,19 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
static const CostTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
@@ -877,20 +931,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
int ScalarCost = getArithmeticInstrCost(
- Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
}
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
+ int Index, VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
@@ -919,19 +973,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// FIXME: Remove some of the alignment restrictions.
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
// vectors.
- int OrigSubElts = SubTp->getVectorNumElements();
- if (NumSubElts > OrigSubElts &&
- (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+ int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
+ (NumSubElts % OrigSubElts) == 0 &&
LT.second.getVectorElementType() ==
- SubLT.second.getVectorElementType() &&
+ SubLT.second.getVectorElementType() &&
LT.second.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits()) {
assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
"Unexpected number of elements!");
- Type *VecTy = VectorType::get(Tp->getVectorElementType(),
- LT.second.getVectorNumElements());
- Type *SubTy = VectorType::get(Tp->getVectorElementType(),
- SubLT.second.getVectorNumElements());
+ auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+ LT.second.getVectorNumElements());
+ auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+ SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
ExtractIndex, SubTy);
@@ -949,6 +1003,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
}
+ // Handle some common (illegal) sub-vector types as they are often very cheap
+ // to shuffle even on targets without PSHUFB.
+ EVT VT = TLI->getValueType(DL, BaseTp);
+ if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
+ !ST->hasSSSE3()) {
+ static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
+ {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry =
+ CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
+ return Entry->Cost;
+ }
+
// We are going to permute multiple sources and the result will be in multiple
// destinations. Providing an accurate cost only for splits where the element
// type remains the same.
@@ -956,25 +1046,26 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
MVT LegalVT = LT.second;
if (LegalVT.isVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
- LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ cast<FixedVectorType>(BaseTp)->getNumElements()) {
- unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
unsigned NumOfDests = LT.first;
- Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles *
getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
@@ -992,9 +1083,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
- {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
};
if (ST->hasVBMI())
@@ -1006,22 +1097,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
- {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
- {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
+ {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
- {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
- {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
- {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
};
if (ST->hasBWI())
@@ -1034,6 +1121,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
@@ -1065,7 +1154,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
+
+ // FIXME: This just applies the type legalization cost rules above
+ // assuming these completely split.
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
};
if (ST->hasAVX512())
@@ -1267,14 +1363,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
@@ -1283,6 +1387,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
// Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
@@ -1290,42 +1399,45 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
- // Mask zero extend is a load + broadcast.
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
};
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
- { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
@@ -1337,14 +1449,70 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
+
+ // Sign extend is zmm vpternlogd+vptruncdb.
+ // Zero extend is zmm broadcast load+vptruncdw.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
+
+ // Sign extend is zmm vpternlogd+vptruncdw.
+ // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
- // v16i1 -> v16i32 - load + broadcast
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
@@ -1356,6 +1524,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
@@ -1367,44 +1538,163 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
+ };
+
+ static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
+
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
- { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 },
- { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1416,6 +1706,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
@@ -1424,13 +1716,16 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
@@ -1447,6 +1742,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
@@ -1456,15 +1753,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
@@ -1503,8 +1806,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
// This node is expanded into scalarized operations but BasicTTI is overly
// optimistic estimating its cost. It computes 3 per element (one
// vector-extract, one scalar conversion and one vector-insert). The
@@ -1544,7 +1854,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ // These truncates end up widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
+
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
@@ -1555,6 +1871,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1580,16 +1903,26 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1616,11 +1949,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ // These truncates are really widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
+
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
@@ -1639,7 +1980,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (ST->hasSSE2() && !ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
LTDest.second, LTSrc.second))
- return LTSrc.first * Entry->Cost;
+ return AdjustCost(LTSrc.first * Entry->Cost);
}
EVT SrcTy = TLI->getValueType(DL, Src);
@@ -1647,61 +1988,77 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
MVT SimpleSrcTy = SrcTy.getSimpleVT();
MVT SimpleDstTy = DstTy.getSimpleVT();
- // Make sure that neither type is going to be split before using the
- // AVX512 tables. This handles -mprefer-vector-width=256
- // with -min-legal-vector-width<=256
- if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
- TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
+ if (ST->useAVX512Regs()) {
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasDQI())
if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
if (ST->hasAVX2()) {
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE41()) {
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE2()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1774,6 +2131,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SELECT, MVT::v16i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+
+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
+
+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
};
static const CostTblEntry AVX2CostTbl[] = {
@@ -1878,14 +2241,14 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
+int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
+ const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1935,12 +2298,20 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::BITREVERSE, MVT::v32i16, 10 },
+ { ISD::BITREVERSE, MVT::v64i8, 10 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTLZ, MVT::v32i16, 28 },
+ { ISD::CTLZ, MVT::v64i8, 18 },
{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTPOP, MVT::v32i16, 18 },
+ { ISD::CTPOP, MVT::v64i8, 12 },
{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::CTTZ, MVT::v32i16, 24 },
+ { ISD::CTTZ, MVT::v64i8, 18 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -1949,6 +2320,22 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::FMAXNUM, MVT::f32, 2 },
+ { ISD::FMAXNUM, MVT::v4f32, 2 },
+ { ISD::FMAXNUM, MVT::v8f32, 2 },
+ { ISD::FMAXNUM, MVT::v16f32, 2 },
+ { ISD::FMAXNUM, MVT::f64, 2 },
+ { ISD::FMAXNUM, MVT::v2f64, 2 },
+ { ISD::FMAXNUM, MVT::v4f64, 2 },
+ { ISD::FMAXNUM, MVT::v8f64, 2 },
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -2031,6 +2418,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
+ { ISD::FMAXNUM, MVT::f32, 3 },
+ { ISD::FMAXNUM, MVT::v4f32, 3 },
+ { ISD::FMAXNUM, MVT::v8f32, 5 },
+ { ISD::FMAXNUM, MVT::f64, 3 },
+ { ISD::FMAXNUM, MVT::v2f64, 3 },
+ { ISD::FMAXNUM, MVT::v4f64, 5 },
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -2105,13 +2498,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
+ { ISD::FMAXNUM, MVT::f64, 4 },
+ { ISD::FMAXNUM, MVT::v2f64, 4 },
{ ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::FMAXNUM, MVT::f32, 4 },
+ { ISD::FMAXNUM, MVT::v4f32, 4 },
{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
+ static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
+ { ISD::CTTZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTTZ, MVT::i32, 1 },
+ { ISD::CTTZ, MVT::i16, 1 },
+ { ISD::CTTZ, MVT::i8, 1 },
+ };
static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
{ ISD::CTLZ, MVT::i64, 1 },
};
@@ -2131,6 +2536,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
@@ -2142,6 +2548,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i32, 8 },
{ ISD::CTPOP, MVT::i16, 9 },
{ ISD::CTPOP, MVT::i8, 7 },
@@ -2153,7 +2562,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDO, MVT::i8, 1 },
};
+ Type *RetTy = ICA.getReturnType();
Type *OpTy = RetTy;
+ Intrinsic::ID IID = ICA.getID();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2173,6 +2584,11 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::cttz:
ISD = ISD::CTTZ;
break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ // FMINNUM has same costs so don't duplicate.
+ ISD = ISD::FMAXNUM;
+ break;
case Intrinsic::sadd_sat:
ISD = ISD::SADDSAT;
break;
@@ -2256,6 +2672,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasBMI()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
@@ -2284,12 +2709,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF) {
+int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::ROTL, MVT::v8i64, 1 },
{ ISD::ROTL, MVT::v4i64, 1 },
@@ -2340,6 +2770,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSHL, MVT::i8, 4 }
};
+ Intrinsic::ID IID = ICA.getID();
+ Type *RetTy = ICA.getReturnType();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2379,7 +2812,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -2391,10 +2824,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
};
assert(Val->isVectorTy() && "This must be a vector type");
-
Type *ScalarType = Val->getScalarType();
+ int RegisterFileMoveCost = 0;
- if (Index != -1U) {
+ if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -2403,17 +2837,32 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return 0;
// The type may be split. Normalize the index to the new type.
- unsigned Width = LT.second.getVectorNumElements();
- Index = Index % Width;
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned SubNumElts = NumElts;
+ Index = Index % NumElts;
+
+ // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+ // For inserts, we also need to insert the subvector back.
+ if (LT.second.getSizeInBits() > 128) {
+ assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ SubNumElts = NumElts / NumSubVecs;
+ if (SubNumElts <= Index) {
+ RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+ Index %= SubNumElts;
+ }
+ }
if (Index == 0) {
// Floating point scalars are already located in index #0.
+ // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
+ // true for all.
if (ScalarType->isFloatingPointTy())
- return 0;
+ return RegisterFileMoveCost;
- // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
- if (ScalarType->isIntegerTy())
- return 1;
+ // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+ if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+ return 1 + RegisterFileMoveCost;
}
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2421,24 +2870,124 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
MVT MScalarTy = LT.second.getScalarType();
if (ST->isSLM())
if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost + RegisterFileMoveCost;
+
+ // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()))
+ return 1 + RegisterFileMoveCost;
+
+ // Assume insertps is relatively cheap on all targets.
+ if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+ Opcode == Instruction::InsertElement)
+ return 1 + RegisterFileMoveCost;
+
+ // For extractions we just need to shuffle the element to index 0, which
+ // should be very cheap (assume cost = 1). For insertions we need to shuffle
+ // the elements to their destination. In both cases we must handle the
+ // subvector move(s).
+ // If the vector type is already less than 128-bits then don't reduce it.
+ // TODO: Under what circumstances should we shuffle using the full width?
+ int ShuffleCost = 1;
+ if (Opcode == Instruction::InsertElement) {
+ auto *SubTy = cast<VectorType>(Val);
+ EVT VT = TLI->getValueType(DL, Val);
+ if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
+ SubTy = FixedVectorType::get(ScalarType, SubNumElts);
+ ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ }
+ int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+ return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
}
// Add to the base cost if we know that the extracted element of a vector is
// destined to be moved to and used in the integer register file.
- int RegisterFileMoveCost = 0;
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
- RegisterFileMoveCost = 1;
+ RegisterFileMoveCost += 1;
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
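For wide vectors the extract/insert costing above first pays for moving the right 128-bit subvector into place, then normalizes the index into that subvector, and finally adds a shuffle plus (for integers) a GPR move. A standalone sketch of that arithmetic with assumed unit costs, ignoring the cheap pinsr/pextr and insertps fast paths:

#include <cstdio>

int extractCost(unsigned NumElts, unsigned EltBits, unsigned Index, bool IsFP) {
  unsigned VecBits = NumElts * EltBits;
  unsigned SubNumElts = NumElts;
  int RegisterFileMoveCost = 0;
  if (VecBits > 128) {            // the element may live in an upper subvector
    unsigned NumSubVecs = VecBits / 128;
    SubNumElts = NumElts / NumSubVecs;
    if (Index >= SubNumElts) {
      RegisterFileMoveCost += 1;  // one extract_subvector to reach it
      Index %= SubNumElts;
    }
  }
  if (Index == 0 && IsFP)
    return RegisterFileMoveCost;  // fp lane 0 is already where we need it
  int ShuffleCost = 1;            // shuffle the element down to lane 0
  int IntOrFpCost = IsFP ? 0 : 1; // plus a movd/movq for integer elements
  return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
}

int main() {
  // extractelement <8 x i32>, index 5, with 128-bit legal registers:
  // vextracti128 (1) + shuffle to lane 0 (1) + movd to GPR (1) = 3.
  std::printf("%d\n", extractCost(8, 32, 5, /*IsFP=*/false));
  return 0;
}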
+unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) {
+ unsigned Cost = 0;
+
+ // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
+ // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
+ if (Insert) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()) ||
+ (MScalarTy == MVT::f32 && ST->hasSSE41())) {
+ // For types we can insert directly, insertion into 128-bit subvectors is
+ // cheap, followed by a cheap chain of concatenations.
+ if (LT.second.getSizeInBits() <= 128) {
+ Cost +=
+ BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+ } else {
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
+ Cost += DemandedElts.countPopulation();
+
+ // For vXf32 cases, insertion into the 0'th index in each v4f32
+ // 128-bit vector is free.
+ // NOTE: This assumes legalization widens vXf32 vectors.
+ if (MScalarTy == MVT::f32)
+ for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
+ i < e; i += 4)
+ if (DemandedElts[i])
+ Cost--;
+ }
+ } else if (LT.second.isVector()) {
+ // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
+ // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
+ // series of UNPCK followed by CONCAT_VECTORS - all of these can be
+ // considered cheap.
+ if (Ty->isIntOrIntVectorTy())
+ Cost += DemandedElts.countPopulation();
+
+ // Get the smaller of the legalized or original pow2-extended number of
+ // vector elements, which represents the number of unpacks we'll end up
+ // performing.
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned Pow2Elts =
+ PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
+ Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
+ }
+ }
+
+ // TODO: Use default extraction for now, but we should investigate extending
+ // this to handle repeated subvector extraction.
+ if (Extract)
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+
+ return Cost;
+}
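To see what the build-vector formula above produces, consider writing all eight lanes of an <8 x float> on an AVX target with fast inserts: one vinsertf128 to concatenate the two 128-bit halves, one insert per demanded lane, minus the free insert into lane 0 of each half. A sketch with those assumed costs:

#include <cstdio>

static unsigned powerOf2Ceil(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  unsigned NumElts = 8;              // <8 x float>
  unsigned NumSubVecs = 256 / 128;   // two 128-bit subvectors
  unsigned LTFirst = 1;              // the type is already legal on AVX
  unsigned Cost = 0;
  Cost += (powerOf2Ceil(NumSubVecs) - 1) * LTFirst; // concatenations (vinsertf128)
  Cost += NumElts;                                  // one insert per demanded lane
  Cost -= NumSubVecs;                               // lane 0 of each v4f32 half is free
  std::printf("build <8 x float> ~= %u\n", Cost);   // 7
  return 0;
}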
+
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if (isa_and_nonnull<StoreInst>(I)) {
+ Value *Ptr = I->getOperand(1);
+ // Store instruction with index and scale costs 2 Uops.
+ // Check the preceding GEP to identify non-const indices.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
+ return TTI::TCC_Basic * 2;
+ }
+ }
+ return TTI::TCC_Basic;
+ }
+
// Handle non-power-of-two vectors such as <3 x float>
- if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
- unsigned NumElem = VTy->getVectorNumElements();
+ if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
+ unsigned NumElem = VTy->getNumElements();
// Handle a few common cases:
// <3 x float>
@@ -2453,14 +3002,21 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
- AddressSpace);
- int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+ AddressSpace, CostKind);
+ int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
+ Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
}
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -2478,33 +3034,36 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
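For non-power-of-2 vectors that don't hit one of the special cases handled above, the memory-op cost is simply the scalar cost per element plus the scalarization overhead of moving lanes in or out of the vector. A rough sketch with assumed unit costs for a hypothetical <5 x float> store:

#include <cstdio>

int main() {
  unsigned NumElem = 5;        // <5 x float>, not a power of two
  int ScalarStoreCost = 1;     // cost of one scalar float store
  int ExtractCostPerLane = 1;  // scalarization overhead: one extract per lane
  int SplitCost = (int)NumElem * ExtractCostPerLane;
  std::printf("store <5 x float> ~= %d\n",
              (int)NumElem * ScalarStoreCost + SplitCost); // 10
  return 0;
}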
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- unsigned Alignment,
- unsigned AddressSpace) {
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
- VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+ auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
if (!SrcVTy)
// For a scalar, take the regular cost without the mask
- return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace);
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
- unsigned NumElem = SrcVTy->getVectorNumElements();
- VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
- (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+ unsigned NumElem = SrcVTy->getNumElements();
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
- int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ int MaskSplitCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
- Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
+ CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-
- int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
+ int ValueSplitCost =
+ getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ Alignment, AddressSpace, CostKind);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
@@ -2519,8 +3078,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
- VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
- LT.second.getVectorNumElements());
+ auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
+ LT.second.getVectorNumElements());
// Expanding requires filling the mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
}
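When a masked load/store is not legal for the target (or the element count is not a power of two), the code above prices full scalarization: unpack the mask, compare and branch per lane, move each value lane, and issue scalar memory ops. A sketch of that sum with assumed unit costs for a 4-element masked store:

#include <cstdio>

int main() {
  unsigned NumElem = 4;       // e.g. a <4 x double> masked store without AVX-512
  int MaskSplitCost = 4;      // extract each mask lane
  int ScalarCmpCost = 1;      // icmp on the extracted mask bit
  int BranchCost = 1;         // conditional branch per lane
  int ValueSplitCost = 4;     // extract each value lane
  int ScalarStoreCost = 1;    // one scalar store per lane
  int MaskCmpCost = (int)NumElem * (BranchCost + ScalarCmpCost);
  int Cost = (int)NumElem * ScalarStoreCost + ValueSplitCost + MaskSplitCost +
             MaskCmpCost;
  std::printf("masked store ~= %d\n", Cost); // 20
  return 0;
}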
@@ -2558,41 +3117,16 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwise) {
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use it as the cost.
- static const CostTblEntry SLMCostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 3 },
- { ISD::ADD, MVT::v2i64, 5 },
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 2 },
- { ISD::FADD, MVT::v4f32, 4 },
- { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
- { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
- { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
- { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
- { ISD::ADD, MVT::v8i16, 5 },
- { ISD::ADD, MVT::v2i8, 2 },
- { ISD::ADD, MVT::v4i8, 2 },
- { ISD::ADD, MVT::v8i8, 2 },
- { ISD::ADD, MVT::v16i8, 3 },
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- { ISD::FADD, MVT::v4f64, 5 },
- { ISD::FADD, MVT::v8f32, 7 },
- { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
- { ISD::ADD, MVT::v8i32, 5 },
- { ISD::ADD, MVT::v16i16, 6 },
- { ISD::ADD, MVT::v32i8, 4 },
- };
-
static const CostTblEntry SLMCostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 3 },
{ ISD::ADD, MVT::v2i64, 5 },
@@ -2633,66 +3167,49 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
EVT VT = TLI->getValueType(DL, ValTy);
if (VT.isSimple()) {
MVT MTy = VT.getSimpleVT();
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
}
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
// FIXME: These assume a naive kshift+binop lowering, which is probably
// conservative in most cases.
- // FIXME: This doesn't cost large types like v128i1 correctly.
static const CostTblEntry AVX512BoolReduction[] = {
{ ISD::AND, MVT::v2i1, 3 },
{ ISD::AND, MVT::v4i1, 5 },
@@ -2738,252 +3255,408 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
};
// Handle bool allof/anyof patterns.
- if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ValVTy->getElementType()->isIntegerTy(1)) {
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+ }
+
+ unsigned NumVecElts = ValVTy->getNumElements();
+ unsigned ScalarSize = ValVTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+
+ unsigned ReductionCost = 0;
+
+ auto *Ty = ValVTy;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+ ReductionCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
+ }
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
+ ReductionCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
}
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
+ // Add the final extract element to the cost.
+ return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
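The reduction loop above halves the vector once per level, paying one shuffle-like op (subvector extract, permute, or shift, depending on the remaining width) plus one arithmetic op per level, and one final extractelement. A standalone sketch for a v8f32 fadd reduction, assuming unit cost for each of those steps:

#include <cstdio>

int main() {
  unsigned NumVecElts = 8;   // v8f32
  int Cost = 0;
  while (NumVecElts > 1) {
    NumVecElts /= 2;
    Cost += 1; // halve the vector: vextractf128 / permute / shift (unit cost assumed)
    Cost += 1; // the vaddps/vaddss for this level
  }
  Cost += 1;   // final extractelement of lane 0
  std::printf("v8f32 fadd reduction ~= %d\n", Cost); // 7
  return 0;
}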
-int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
- bool IsPairwise, bool IsUnsigned) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MTy = LT.second;
int ISD;
- if (ValTy->isIntOrIntVectorTy()) {
+ if (Ty->isIntOrIntVectorTy()) {
ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
} else {
- assert(ValTy->isFPOrFPVectorTy() &&
+ assert(Ty->isFPOrFPVectorTy() &&
"Expected float point or integer vector type.");
ISD = ISD::FMINNUM;
}
- // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
- // and make it as the cost.
+ static const CostTblEntry SSE1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ };
- static const CostTblEntry SSE1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
- };
-
- static const CostTblEntry SSE41CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 2},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,10},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 2},
- {ISD::UMIN, MVT::v8i16, 2},
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
- };
-
- static const CostTblEntry SSE42CostTblPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 2},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 3},
- {ISD::UMIN, MVT::v8i32, 3},
- {ISD::SMIN, MVT::v16i16, 3},
- {ISD::UMIN, MVT::v16i16, 3},
- {ISD::SMIN, MVT::v32i8, 3},
- {ISD::UMIN, MVT::v32i8, 3},
- };
-
- static const CostTblEntry AVX2CostTblPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 2},
- {ISD::UMIN, MVT::v4i64, 2},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX512CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 2},
- {ISD::UMIN, MVT::v8i64, 2},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- static const CostTblEntry SSE1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
+ static const CostTblEntry SSE2CostTbl[] = {
+ {ISD::FMINNUM, MVT::v2f64, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE2CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
+ static const CostTblEntry SSE41CostTbl[] = {
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE41CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 3},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,11},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
+ static const CostTblEntry SSE42CostTbl[] = {
+ {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ static const CostTblEntry AVX1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v8f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::SMIN, MVT::v8i32, 3},
+ {ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
};
- static const CostTblEntry AVX1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 1},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 2},
- {ISD::UMIN, MVT::v8i32, 2},
- {ISD::SMIN, MVT::v16i16, 2},
- {ISD::UMIN, MVT::v16i16, 2},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX2CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 1},
- {ISD::UMIN, MVT::v4i64, 1},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 1},
- {ISD::UMIN, MVT::v32i8, 1},
- };
-
- static const CostTblEntry AVX512CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 1},
- {ISD::UMIN, MVT::v8i64, 1},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- if (IsPairwise) {
- if (ST->hasAVX512())
- if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX2CostTbl[] = {
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 1},
+ {ISD::UMIN, MVT::v32i8, 1},
+ };
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512CostTbl[] = {
+ {ISD::FMINNUM, MVT::v16f32, 1},
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::SMIN, MVT::v2i64, 1},
+ {ISD::UMIN, MVT::v2i64, 1},
+ {ISD::SMIN, MVT::v4i64, 1},
+ {ISD::UMIN, MVT::v4i64, 1},
+ {ISD::SMIN, MVT::v8i64, 1},
+ {ISD::UMIN, MVT::v8i64, 1},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ {ISD::SMIN, MVT::v32i16, 1},
+ {ISD::UMIN, MVT::v32i16, 1},
+ {ISD::SMIN, MVT::v64i8, 1},
+ {ISD::UMIN, MVT::v64i8, 1},
+ };
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ // If we have a native MIN/MAX instruction for this type, use it.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ unsigned CmpOpcode;
+ if (Ty->isFPOrFPVectorTy()) {
+ CmpOpcode = Instruction::FCmp;
} else {
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ assert(Ty->isIntOrIntVectorTy() &&
+ "expecting floating point or integer type for min/max reduction");
+ CmpOpcode = Instruction::ICmp;
+ }
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Otherwise fall back to cmp+select.
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
+}
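The fallback at the end of getMinMaxCost models a min without a native instruction as one compare feeding one select, which matches how it lowers. A scalar analogue of that lowering (illustrative only, not LLVM code):

#include <cstdint>

int64_t smin(int64_t A, int64_t B) {
  bool C = A < B;   // the ICmp
  return C ? A : B; // the Select; total cost = cmp cost + select cost
}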
+
+int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (ValTy->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(ValTy->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use it as the cost.
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
+ {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
+ {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
+ {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
+ {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v2i8, 3}, // pminsb
+ {ISD::SMIN, MVT::v4i8, 5}, // pminsb
+ {ISD::SMIN, MVT::v8i8, 7}, // pminsb
+ {ISD::SMIN, MVT::v16i8, 6},
+ {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
+ {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
+ {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
+ {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v16i16, 6},
+ {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v32i8, 8},
+ {ISD::UMIN, MVT::v32i8, 8},
+ };
+
+ static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v32i16, 8},
+ {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v64i8, 10},
+ {ISD::UMIN, MVT::v64i8, 10},
+ };
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
-
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
+ }
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+ unsigned NumVecElts = ValVTy->getNumElements();
+
+ auto *Ty = ValVTy;
+ unsigned MinMaxCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 operations.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
+ MTy.getVectorNumElements());
+ MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ MinMaxCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
}
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ unsigned ScalarSize = ValTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(ValVTy->getNumElements()) ||
+ ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ VectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
+ MinMaxCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the min/max op for this level.
+ auto *SubCondTy =
+ FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
+ MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ }
+
+ // Add the final extract element to the cost.
+ return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
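The min/max reduction then follows the same halving loop, paying getMinMaxCost at each level instead of an add. A sketch for a v4i32 smin reduction (a case the narrow-type tables above don't cover), assuming unit costs for pshufd and pminsd:

#include <cstdio>

int main() {
  unsigned NumVecElts = 4; // v4i32
  int Cost = 0;
  while (NumVecElts > 1) {
    NumVecElts /= 2;
    Cost += 1; // pshufd/psrldq to line up the two halves
    Cost += 1; // pminsd for this level (getMinMaxCost with a native min)
  }
  Cost += 1;   // final movd of lane 0
  std::printf("v4i32 smin reduction ~= %d\n", Cost); // 5
  return 0;
}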
/// Calculate the cost of materializing a 64-bit value. This helper
@@ -2999,7 +3672,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) {
return 2 * TTI::TCC_Basic;
}
-int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3034,7 +3708,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3121,17 +3795,18 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Idx == ImmIdx) {
int NumConstants = divideCeil(BitSize, 64);
- int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3162,52 +3837,45 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned X86TTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
- if (isa<StoreInst>(U)) {
- Value *Ptr = U->getOperand(1);
- // Store instruction with index and scale costs 2 Uops.
- // Check the preceding GEP to identify non-const indices.
- if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
- return TTI::TCC_Basic * 2;
- }
- return TTI::TCC_Basic;
- }
- return BaseT::getUserCost(U, Operands);
+unsigned
+X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return 0;
}
// Return an average cost of Gather / Scatter instruction, maybe improved later
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
- unsigned Alignment, unsigned AddressSpace) {
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
// Try to reduce index size from 64 bit (default for GEP)
// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
// operation will use 16 x 64 indices which do not fit in a zmm and need
// to be split. Also check that the base pointer is the same for all lanes,
// and that there's at most one variable index.
- auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
unsigned IndexSize = DL.getPointerSizeInBits();
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (IndexSize < 64 || !GEP)
return IndexSize;
unsigned NumOfVarIndices = 0;
- Value *Ptrs = GEP->getPointerOperand();
+ const Value *Ptrs = GEP->getPointerOperand();
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
return IndexSize;
for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
if (isa<Constant>(GEP->getOperand(i)))
continue;
Type *IndxTy = GEP->getOperand(i)->getType();
- if (IndxTy->isVectorTy())
- IndxTy = IndxTy->getVectorElementType();
+ if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
+ IndxTy = IndexVTy->getElementType();
if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
!isa<SExtInst>(GEP->getOperand(i))) ||
++NumOfVarIndices > 1)
@@ -3216,21 +3884,21 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
return (unsigned)32;
};
-
// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.
unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
? getIndexSizeInBits(Ptr, DL)
: DL.getPointerSizeInBits();
- Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
- IndexSize), VF);
+ auto *IndexVTy = FixedVectorType::get(
+ IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
- Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ auto *SplitSrcTy =
+ FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
AddressSpace);
}
@@ -3241,7 +3909,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
? ST->getGatherOverhead()
: ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ TTI::TCK_RecipThroughput);
}
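When the gather/scatter is kept as a hardware instruction, the estimate above is a flat per-instruction overhead from the subtarget plus one scalar memory-op cost per lane. A sketch with an assumed overhead of 2 (the real value comes from ST->getGatherOverhead() / ST->getScatterOverhead()):

#include <cstdio>

int main() {
  unsigned VF = 8;        // a v8i32 gather
  int GatherOverhead = 2; // subtarget-tuned constant (assumed here)
  int ScalarLoadCost = 1; // per-lane scalar load cost
  std::printf("gather ~= %d\n", GatherOverhead + (int)VF * ScalarLoadCost); // 10
  return 0;
}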
/// Return the cost of full scalarization of gather / scatter operation.
@@ -3253,25 +3922,29 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
- bool VariableMask, unsigned Alignment,
+ bool VariableMask, Align Alignment,
unsigned AddressSpace) {
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(VF);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
int MaskUnpackCost = 0;
if (VariableMask) {
- VectorType *MaskTy =
- VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
- MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+ MaskUnpackCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
- nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ nullptr, CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
int InsertExtractCost = 0;
if (Opcode == Instruction::Load)
@@ -3290,21 +3963,28 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
- Value *Ptr, bool VariableMask,
- unsigned Alignment) {
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
+
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
- PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ PtrTy = dyn_cast<PointerType>(
+ cast<VectorType>(Ptr->getType())->getElementType());
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
bool Scalarize = false;
if ((Opcode == Instruction::Load &&
- !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) ||
+ !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
(Opcode == Instruction::Store &&
- !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment))))
+ !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.
@@ -3337,12 +4017,13 @@ bool X86TTIImpl::canMacroFuseCmp() {
return ST->hasMacroFusion() || ST->hasBranchFusion();
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (!ST->hasAVX())
return false;
// The backend can't handle a single element vector.
- if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
+ if (isa<VectorType>(DataTy) &&
+ cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
Type *ScalarTy = DataTy->getScalarType();
@@ -3360,7 +4041,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
return isLegalMaskedLoad(DataType, Alignment);
}
@@ -3407,10 +4088,10 @@ bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
return false;
// The backend can't handle a single element vector.
- if (DataTy->getVectorNumElements() == 1)
+ if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
- Type *ScalarTy = DataTy->getVectorElementType();
+ Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
@@ -3427,7 +4108,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
// enable gather with a -march.
@@ -3446,8 +4127,8 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
// In this case we can reject non-power-of-2 vectors.
// We also reject single element vectors as the type legalizer can't
// scalarize it.
- if (isa<VectorType>(DataTy)) {
- unsigned NumElts = DataTy->getVectorNumElements();
+ if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
+ unsigned NumElts = DataVTy->getNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return false;
}
@@ -3465,7 +4146,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
return IntWidth == 32 || IntWidth == 64;
}
-bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())
return false;
@@ -3505,11 +4186,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
// If we get here, we know the target features match. If one function
// considers 512-bit vectors legal and the other does not, consider them
// incompatible.
- // FIXME Look at the arguments and only consider 512 bit or larger vectors?
const TargetMachine &TM = getTLI()->getTargetMachine();
- return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
- TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+ if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+ return true;
+
+ // Consider the arguments compatible if they aren't vectors or aggregates.
+ // FIXME: Look at the size of vectors.
+ // FIXME: Look at the element types of aggregates to see if there are vectors.
+ // FIXME: The API of this function seems intended to allow arguments
+ // to be removed from the set, but the caller doesn't check if the set
+ // becomes empty so that may not work in practice.
+ return llvm::none_of(Args, [](Argument *A) {
+ auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+ return EltTy->isVectorTy() || EltTy->isAggregateType();
+ });
}
X86TTIImpl::TTI::MemCmpExpansionOptions
@@ -3517,6 +4209,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = 2;
+ // All GPR and vector loads can be unaligned.
+ Options.AllowOverlappingLoads = true;
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
@@ -3524,8 +4218,6 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
- // All GPR and vector loads can be unaligned.
- Options.AllowOverlappingLoads = true;
}
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
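Allowing overlapping loads is what lets the memcmp expansion cover an odd-sized buffer with two wide loads instead of a shrinking tail of narrower ones. For example, a 13-byte equality compare with 8-byte loads can read offsets 0 and 5; a scalar sketch of that pattern (illustrative, not the expansion pass itself):

#include <cstdint>
#include <cstring>

bool eq13(const char *A, const char *B) {
  uint64_t A0, A1, B0, B1;
  std::memcpy(&A0, A, 8);     std::memcpy(&B0, B, 8);     // bytes [0, 8)
  std::memcpy(&A1, A + 5, 8); std::memcpy(&B1, B + 5, 8); // bytes [5, 13), overlapping
  return ((A0 ^ B0) | (A1 ^ B1)) == 0;
}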
@@ -3555,24 +4247,22 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Support also strided loads (interleaved-groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3584,10 +4274,11 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// (see MachineValueType.h::getVectorVT()).
if (!LegalVT.isVector())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
- Type *ScalarTy = VecTy->getVectorElementType();
+ unsigned VF = VecTy->getNumElements() / Factor;
+ Type *ScalarTy = VecTy->getElementType();
// Calculate the number of memory operations (NumOfMemOps), required
// for load/store the VecTy.
@@ -3596,16 +4287,18 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- VectorType *VT = VectorType::get(ScalarTy, VF);
+ auto *VT = FixedVectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// TODO: Complete for other data-types and strides.
// Each combination of Stride, ElementTy and VF results in a different
@@ -3664,24 +4357,21 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, CostKind);
}
// Get estimation for interleaved load/store operations and strided load.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduces the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
@@ -3696,12 +4386,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
+ unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
if (Opcode == Instruction::Load) {
@@ -3733,8 +4424,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfLoadsInInterleaveGrp =
Indices.size() ? Indices.size() : Factor;
- Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / Factor);
+ auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / Factor);
unsigned NumOfResults =
getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
NumOfLoadsInInterleaveGrp;
@@ -3796,15 +4487,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
return Cost;
}
-int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = cast<VectorType>(VecTy)->getElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
@@ -3813,15 +4501,15 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return false;
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
- return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX512(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
- return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX2(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
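For context on what the dispatch above is pricing: an interleave group with Factor = 3 and VF = 4 is a single <12 x i32> wide load followed by shuffles that split it into three <4 x i32> streams (and the reverse for stores). A scalar sketch of that access pattern, purely to illustrate the shape being costed:

#include <cstdio>

int main() {
  int Mem[12];
  for (int I = 0; I < 12; ++I)
    Mem[I] = I;                    // interleaved R,G,B,R,G,B,... layout
  int R[4], G[4], B[4];
  for (int I = 0; I < 4; ++I) {    // what the wide load + shuffles recover
    R[I] = Mem[3 * I + 0];
    G[I] = Mem[3 * I + 1];
    B[I] = Mem[3 * I + 2];
  }
  std::printf("%d %d %d\n", R[3], G[3], B[3]); // 9 10 11
  return 0;
}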