Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--   llvm/lib/Target/X86/X86TargetTransformInfo.cpp   1772
1 file changed, 1091 insertions, 681 deletions
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 71455237fb61..971c430d73b1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -129,26 +129,30 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
return 8;
}
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+TypeSize
+X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
- if (Vector) {
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
+ case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasAVX512() && PreferVectorWidth >= 512)
- return 512;
+ return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
- return 256;
+ return TypeSize::getFixed(256);
if (ST->hasSSE1() && PreferVectorWidth >= 128)
- return 128;
- return 0;
+ return TypeSize::getFixed(128);
+ return TypeSize::getFixed(0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(0);
}
- if (ST->is64Bit())
- return 64;
-
- return 32;
+ llvm_unreachable("Unsupported register kind");
}
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
- return getRegisterBitWidth(true);
+ return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize();
}
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
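For reference, a minimal sketch of how a caller uses the new kind-based register-width query above (hypothetical caller; `TTI` is assumed to be a TargetTransformInfo handle already in scope, not something introduced by this patch):
  // Ask for the preferred fixed-width vector register size. The result is a
  // TypeSize now, so scalable-vector targets can share the same interface.
  TypeSize VecRegBits =
      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
  if (VecRegBits.getFixedSize() >= 256) {
    // Wide enough to plan around 256-bit (YMM) vectors.
  }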
@@ -169,21 +173,35 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 2;
}
-int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueKind Op1Info,
- TTI::OperandValueKind Op2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+InstructionCost X86TTIImpl::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
+
+ // vXi8 multiplications are always promoted to vXi16.
+ if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
+ Ty->getScalarSizeInBits() == 8) {
+ Type *WideVecTy =
+ VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
+ return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
+ Opd1PropInfo, Opd2PropInfo);
+ }
+
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
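As a rough illustration of the vXi8 multiply promotion added above, the reported cost is just the sum of a zero-extend, the widened multiply, and the truncate back. A hedged sketch for a hypothetical <16 x i8> type (`TTI` and `Ctx` are assumed to exist in the caller; this is not code from the patch):
  auto *NarrowTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
  auto *WideTy = VectorType::getExtendedElementVectorType(NarrowTy); // <16 x i16>
  InstructionCost MulCost =
      TTI.getCastInstrCost(Instruction::ZExt, WideTy, NarrowTy,
                           TargetTransformInfo::CastContextHint::None,
                           TargetTransformInfo::TCK_RecipThroughput) +
      TTI.getCastInstrCost(Instruction::Trunc, NarrowTy, WideTy,
                           TargetTransformInfo::CastContextHint::None,
                           TargetTransformInfo::TCK_RecipThroughput) +
      TTI.getArithmeticInstrCost(Instruction::Mul, WideTy,
                                 TargetTransformInfo::TCK_RecipThroughput);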
@@ -203,7 +221,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
static const CostTblEntry SLMCostTable[] = {
{ ISD::MUL, MVT::v4i32, 11 }, // pmulld
{ ISD::MUL, MVT::v8i16, 2 }, // pmullw
- { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
{ ISD::FMUL, MVT::f64, 2 }, // mulsd
{ ISD::FMUL, MVT::v2f64, 4 }, // mulpd
{ ISD::FMUL, MVT::v4f32, 2 }, // mulps
@@ -261,10 +278,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// normally expanded to the sequence SRA + SRL + ADD + SRA.
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
- int Cost =
+ InstructionCost Cost =
2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
- Op2Info,
- TargetTransformInfo::OP_None,
+ Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
Op2Info,
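The trailing context here is cut off mid-statement; the lines that follow in the source add the LShr and Add components as well, so for a signed divide by a power-of-two constant the model works out to roughly (sketch, not literal code):
  // Cost(sdiv x, 2^k)  ~=  2 * Cost(AShr) + Cost(LShr) + Cost(Add),
  // matching the SRA + SRL + ADD + SRA expansion described in the comment.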
@@ -491,14 +507,22 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
}
static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
+ { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
+ { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
+ { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
+ { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
+
{ ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -516,6 +540,12 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
{ ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
{ ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
+
+ { ISD::SHL, MVT::v8i32, 1 }, // pslld
+ { ISD::SRL, MVT::v8i32, 1 }, // psrld
+ { ISD::SRA, MVT::v8i32, 1 }, // psrad
+ { ISD::SHL, MVT::v4i64, 1 }, // psllq
+ { ISD::SRL, MVT::v4i64, 1 }, // psrlq
};
if (ST->hasAVX2() &&
@@ -549,9 +579,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
}
static const CostTblEntry AVX512DQCostTable[] = {
- { ISD::MUL, MVT::v2i64, 1 },
- { ISD::MUL, MVT::v4i64, 1 },
- { ISD::MUL, MVT::v8i64, 1 }
+ { ISD::MUL, MVT::v2i64, 2 }, // pmullq
+ { ISD::MUL, MVT::v4i64, 2 }, // pmullq
+ { ISD::MUL, MVT::v8i64, 2 } // pmullq
};
// Look for AVX512DQ lowering tricks for custom cases.
@@ -563,10 +593,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
-
- { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
};
// Look for AVX512BW lowering tricks for custom cases.
@@ -575,10 +601,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512CostTable[] = {
+ { ISD::SHL, MVT::v4i32, 1 },
+ { ISD::SRL, MVT::v4i32, 1 },
+ { ISD::SRA, MVT::v4i32, 1 },
+ { ISD::SHL, MVT::v8i32, 1 },
+ { ISD::SRL, MVT::v8i32, 1 },
+ { ISD::SRA, MVT::v8i32, 1 },
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
+ { ISD::SHL, MVT::v2i64, 1 },
+ { ISD::SRL, MVT::v2i64, 1 },
+ { ISD::SHL, MVT::v4i64, 1 },
+ { ISD::SRL, MVT::v4i64, 1 },
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
@@ -586,21 +622,28 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
- { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
- { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
+ { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
+ { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
@@ -608,18 +651,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShiftCostTable[] = {
- // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+ // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
- { ISD::SHL, MVT::v4i32, 1 },
- { ISD::SRL, MVT::v4i32, 1 },
- { ISD::SRA, MVT::v4i32, 1 },
- { ISD::SHL, MVT::v8i32, 1 },
- { ISD::SRL, MVT::v8i32, 1 },
- { ISD::SRA, MVT::v8i32, 1 },
- { ISD::SHL, MVT::v2i64, 1 },
- { ISD::SRL, MVT::v2i64, 1 },
- { ISD::SHL, MVT::v4i64, 1 },
- { ISD::SRL, MVT::v4i64, 1 },
+ { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
+ { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
+ { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
+ { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
+ { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
+ { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
+ { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
+ { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
+ { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
+ { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
};
if (ST->hasAVX512()) {
@@ -634,8 +677,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
TargetTransformInfo::OP_None);
}
- // Look for AVX2 lowering tricks.
- if (ST->hasAVX2()) {
+ // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
+ if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
@@ -733,22 +776,28 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
}
static const CostTblEntry AVX2CostTable[] = {
- { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
- { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
- { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
-
- { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
- { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
- { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
-
- { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
- { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
- { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
{ ISD::SUB, MVT::v32i8, 1 }, // psubb
{ ISD::ADD, MVT::v32i8, 1 }, // paddb
@@ -759,16 +808,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SUB, MVT::v4i64, 1 }, // psubq
{ ISD::ADD, MVT::v4i64, 1 }, // paddq
- { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
{ ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
- { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+ { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
+ { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
@@ -790,7 +841,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
{ ISD::MUL, MVT::v16i16, 4 },
- { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
+ { ISD::MUL, MVT::v4i64, 12 },
+
{ ISD::SUB, MVT::v32i8, 4 },
{ ISD::ADD, MVT::v32i8, 4 },
{ ISD::SUB, MVT::v16i16, 4 },
@@ -800,14 +853,34 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SUB, MVT::v4i64, 4 },
{ ISD::ADD, MVT::v4i64, 4 },
- // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
- // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
- // Because we believe v4i64 to be a legal type, we must also include the
- // extract+insert in the cost table. Therefore, the cost here is 18
- // instead of 8.
- { ISD::MUL, MVT::v4i64, 18 },
-
- { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+ { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
+ { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
+ { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
+
+ { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
+ { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+ { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
+
+ { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
+ { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+ { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
+
+ { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
+
+ { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
+ { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
@@ -841,6 +914,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+
+ { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
};
if (ST->hasSSE42())
@@ -848,26 +923,16 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
- { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
- { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
- { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+ { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
- { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
-
- { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
- { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
- { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
- { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
-
- { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
- { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
- { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
- { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+ { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
+ { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+
+ { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
+ { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
{ ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
@@ -879,25 +944,21 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
- { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+ { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
+ { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
+ { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
- { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+ { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
+ { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
+ { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
- { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
- { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
- { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
- { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
- { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+ { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
+ { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
+ { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+ { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
- { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v8i16, 1 }, // pmullw
{ ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
@@ -907,6 +968,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
+ { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
+
{ ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
@@ -922,25 +988,42 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
+ { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
+ { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
{ ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
-
- { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
- { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
- { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
-
- { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
- { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
- { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
+ static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+ { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+ { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+ };
+
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+ { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+
+ { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+ { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+ };
+
+ if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spilling regular
// registers. The overhead of division is going to dominate most kernels
@@ -949,7 +1032,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// to hide "20 cycles" for each lane.
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
- int ScalarCost = getArithmeticInstrCost(
+ InstructionCost ScalarCost = getArithmeticInstrCost(
Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
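As a worked example of the formula above (illustrative numbers only): for a hypothetical sdiv on <4 x i32> where v4i32 is legal, LT.first == 1 and there are 4 lanes, so the vector cost comes out to 20 * 1 * 4 * ScalarCost, i.e. 80 times the scalar divide cost, which strongly discourages vectorizing division.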
@@ -959,12 +1042,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
- int Index, VectorType *SubTp) {
+InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+ VectorType *BaseTp,
+ ArrayRef<int> Mask, int Index,
+ VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+ Kind = improveShuffleKindFromMask(Kind, Mask);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
Kind = TTI::SK_PermuteTwoSrc;
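A hedged sketch of the updated interface from the caller's side, showing why the mask is now threaded through (hypothetical values; `TTI` and an 8-element `VecTy` are assumed):
  // The concrete mask lets improveShuffleKindFromMask refine the kind, e.g.
  // a nominal two-source shuffle whose mask only reads the first source.
  SmallVector<int, 8> Mask = {0, 1, 2, 3, 0, 1, 2, 3};
  InstructionCost C =
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask,
                         /*Index=*/0, /*SubTp=*/nullptr);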
@@ -981,7 +1067,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
int NumElts = LT.second.getVectorNumElements();
if ((Index % NumElts) == 0)
return 0;
- std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+ std::pair<InstructionCost, MVT> SubLT =
+ TLI->getTypeLegalizationCost(DL, SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
@@ -1006,8 +1093,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
- int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
- ExtractIndex, SubTy);
+ InstructionCost ExtractCost = getShuffleCost(
+ TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
// If the original size is 32-bits or more, we can use pshufd. Otherwise
// if we have SSSE3 we can use pshufb.
@@ -1022,6 +1109,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
}
}
+ // Subvector insertions are cheap if the subvectors are aligned.
+ // Note that in general, the insertion starting at the beginning of a vector
+ // isn't free, because we need to preserve the rest of the wide vector.
+ if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
+ int NumElts = LT.second.getVectorNumElements();
+ std::pair<InstructionCost, MVT> SubLT =
+ TLI->getTypeLegalizationCost(DL, SubTp);
+ if (SubLT.second.isVector()) {
+ int NumSubElts = SubLT.second.getVectorNumElements();
+ if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+ return SubLT.first;
+ }
+ }
+
// Handle some common (illegal) sub-vector types as they are often very cheap
// to shuffle even on targets without PSHUFB.
EVT VT = TLI->getValueType(DL, BaseTp);
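For example (illustration only), inserting a <4 x float> subvector into an <8 x float> at element index 4 passes both alignment checks above (4 % 4 == 0 and 8 % 4 == 0), so the query returns just SubLT.first; on AVX this typically corresponds to a single vinsertf128. A hedged sketch of such a query (`TTI` and `Ctx` assumed):
  auto *WideTy = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
  auto *SubTy = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
  InstructionCost C =
      TTI.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, WideTy,
                         None, /*Index=*/4, SubTy);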
@@ -1074,24 +1175,24 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
- unsigned NumOfDests = LT.first;
+ InstructionCost NumOfDests = LT.first;
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
- unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
- return NumOfShuffles *
- getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+ InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+ return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+ None, 0, nullptr);
}
- return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
// We assume that source and destination have the same vector type.
- int NumOfDests = LT.first;
- int NumOfShufflesPerDest = LT.first * 2 - 1;
+ InstructionCost NumOfDests = LT.first;
+ InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
LT.first = NumOfDests * NumOfShufflesPerDest;
}
@@ -1150,6 +1251,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+ {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+ {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1392,26 +1495,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
- TTI::CastContextHint CCH,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src,
+ TTI::CastContextHint CCH,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// TODO: Allow non-throughput costs that aren't binary.
- auto AdjustCost = [&CostKind](int Cost) {
+ auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
};
+ // The cost tables include both specific, custom (non-legal) src/dst type
+ // conversions and generic, legalized types. We test for customs first, before
+ // falling back to legalization.
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
-
static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
@@ -1446,10 +1552,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
@@ -1494,11 +1603,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
@@ -1554,33 +1667,40 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
- { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
- { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
- { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
- { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
+ { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
+ { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
- { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
- { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
+ { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
+ { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
@@ -1636,12 +1756,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
@@ -1662,6 +1782,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
+ { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
@@ -1696,14 +1819,31 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
@@ -1711,20 +1851,17 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
- { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
- { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
-
- { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1732,252 +1869,307 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
+
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
+
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
};
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
-
- { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
-
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
-
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
-
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
- // The generic code to compute the scalar overhead is currently broken.
- // Workaround this limitation by estimating the scalarization overhead
- // here. We have roughly 10 instructions per scalar element.
- // Multiply that by the vector width.
- // FIXME: remove that when PR19268 is fixed.
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
-
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
- { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
- { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
-
- { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
- { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
- // This node is expanded into scalarized operations but BasicTTI is overly
- // optimistic estimating its cost. It computes 3 per element (one
- // vector-extract, one scalar conversion and one vector-insert). The
- // problem is that the inserts form a read-modify-write chain so latency
- // should be factored in too. Inflating the cost per element by 1.
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
- { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+ { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
+
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
+
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
+
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
-
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
// These truncates end up widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
-
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
+
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
-
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
-
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
- // These are somewhat magic numbers justified by looking at the output of
- // Intel's IACA, running some kernels and making sure when we take
- // legalization into account the throughput will be overestimated.
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
-
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
- { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
- { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
- { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
- { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
-
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
-
- { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
- { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
-
+ // These are somewhat magic numbers justified by comparing the
+ // output of llvm-mca for our various supported scheduler models
+ // and basing the cost on the worst-case scenario.
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
+ { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
+ { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
+
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
+ { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
+
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
+ { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
+ { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
+
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
- { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
- { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
- { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
- { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
- { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
- { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
-
- { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
- { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
- { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 },
- { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
- { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 },
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
+ { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },
+
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },
// These truncates are really widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
@@ -1987,113 +2179,185 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
- { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
+ { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
- { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
- { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
};
- std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
- std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst);
-
- if (ST->hasSSE2() && !ST->hasAVX()) {
- if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
- LTDest.second, LTSrc.second))
- return AdjustCost(LTSrc.first * Entry->Cost);
- }
-
+ // Attempt to map directly to (simple) MVT types to let us match custom entries.
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
// The function getSimpleVT only handles simple value types.
- if (!SrcTy.isSimple() || !DstTy.isSimple())
- return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind));
-
- MVT SimpleSrcTy = SrcTy.getSimpleVT();
- MVT SimpleDstTy = DstTy.getSimpleVT();
+ if (SrcTy.isSimple() && DstTy.isSimple()) {
+ MVT SimpleSrcTy = SrcTy.getSimpleVT();
+ MVT SimpleDstTy = DstTy.getSimpleVT();
+
+ if (ST->useAVX512Regs()) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
- if (ST->useAVX512Regs()) {
if (ST->hasBWI())
- if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
if (ST->hasDQI())
- if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
if (ST->hasAVX512())
- if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX2()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasAVX()) {
+ if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE41()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+
+ if (ST->hasSSE2()) {
+ if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+ }
+ }
+
+ // Fall back to legalized types.
+ std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src);
+ std::pair<InstructionCost, MVT> LTDest =
+ TLI->getTypeLegalizationCost(DL, Dst);
+
+ if (ST->useAVX512Regs()) {
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(
+ AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
}
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
if (ST->hasDQI())
if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasAVX2()) {
+ if (ST->hasAVX2())
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
- }
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasAVX()) {
+ if (ST->hasAVX())
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
- }
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasSSE41()) {
+ if (ST->hasSSE41())
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
- }
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
- if (ST->hasSSE2()) {
+ if (ST->hasSSE2())
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
- SimpleDstTy, SimpleSrcTy))
- return AdjustCost(Entry->Cost);
+ LTDest.second, LTSrc.second))
+ return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost);
+
+ // Fallback for i8/i16 sitofp/uitofp cases: extend the source to i32 and
+ // cost it as an i32 sitofp.
+ if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
+ 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) {
+ Type *ExtSrc = Src->getWithNewBitWidth(32);
+ unsigned ExtOpc =
+ (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt;
+
+ // For scalar loads the extend would be free.
+ InstructionCost ExtCost = 0;
+ if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0))))
+ ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind);
+
+ return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc,
+ TTI::CastContextHint::None, CostKind);
+ }
+
+ // Fallback for fptosi/fptoui i8/i16 cases: compute as an i32 fptosi and
+ // truncate the result.
+ if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) &&
+ 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) {
+ Type *TruncDst = Dst->getWithNewBitWidth(32);
+ return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) +
+ getCastInstrCost(Instruction::Trunc, Dst, TruncDst,
+ TTI::CastContextHint::None, CostKind);
}
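// A minimal sketch of the narrow-integer fallbacks above, assuming `TTI` is
// an X86TTIImpl instance, `Ctx` an LLVMContext and `CostKind` is
// TTI::TCK_RecipThroughput (all hypothetical names). A scalar i16 -> float
// sitofp that misses the tables would be costed as a sign-extension to i32
// plus an i32-based sitofp (the extension is skipped when the scalar source
// is a load):
InstructionCost I16SIToFPCost =
    TTI.getCastInstrCost(Instruction::SExt, Type::getInt32Ty(Ctx),
                         Type::getInt16Ty(Ctx), TTI::CastContextHint::None,
                         CostKind) +
    TTI.getCastInstrCost(Instruction::SIToFP, Type::getFloatTy(Ctx),
                         Type::getInt32Ty(Ctx), TTI::CastContextHint::None,
                         CostKind);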
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
}
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- CmpInst::Predicate VecPred,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy,
+ CmpInst::Predicate VecPred,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
I);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -2279,8 +2543,9 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
-int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
- const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
@@ -2312,6 +2577,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::BITREVERSE, MVT::v16i32, 5 },
{ ISD::BITREVERSE, MVT::v32i16, 5 },
{ ISD::BITREVERSE, MVT::v64i8, 5 },
+ { ISD::BSWAP, MVT::v8i64, 1 },
+ { ISD::BSWAP, MVT::v16i32, 1 },
+ { ISD::BSWAP, MVT::v32i16, 1 },
{ ISD::CTLZ, MVT::v8i64, 23 },
{ ISD::CTLZ, MVT::v16i32, 22 },
{ ISD::CTLZ, MVT::v32i16, 18 },
@@ -2352,6 +2620,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::BITREVERSE, MVT::v16i32, 24 },
{ ISD::BITREVERSE, MVT::v32i16, 10 },
{ ISD::BITREVERSE, MVT::v64i8, 10 },
+ { ISD::BSWAP, MVT::v8i64, 4 },
+ { ISD::BSWAP, MVT::v16i32, 4 },
+ { ISD::BSWAP, MVT::v32i16, 4 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
{ ISD::CTLZ, MVT::v32i16, 28 },
@@ -2670,6 +2941,7 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ABS, MVT::i64, 2 }, // SUB+CMOV
{ ISD::BITREVERSE, MVT::i64, 14 },
+ { ISD::BSWAP, MVT::i64, 1 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
@@ -2683,6 +2955,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
{ ISD::BITREVERSE, MVT::i32, 14 },
{ ISD::BITREVERSE, MVT::i16, 14 },
{ ISD::BITREVERSE, MVT::i8, 11 },
+ { ISD::BSWAP, MVT::i32, 1 },
+ { ISD::BSWAP, MVT::i16, 1 }, // ROL
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
@@ -2782,7 +3056,7 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
@@ -2802,7 +3076,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
return LT.first * Cost;
}
- auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost,
+ auto adjustTableCost = [](const CostTblEntry &Entry,
+ InstructionCost LegalizationCost,
FastMathFlags FMF) {
// If there are no NANs to deal with, then these are reduced to a
// single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
@@ -2893,6 +3168,17 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
return adjustTableCost(*Entry, LT.first, ICA.getFlags());
}
+ if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
+ if (const Instruction *II = ICA.getInst()) {
+ if (II->hasOneUse() && isa<StoreInst>(II->user_back()))
+ return TTI::TCC_Free;
+ if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) {
+ if (LI->hasOneUse())
+ return TTI::TCC_Free;
+ }
+ }
+ }
+
// TODO - add BMI (TZCNT) scalar handling
if (ST->is64Bit())
@@ -2906,8 +3192,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
@@ -2983,7 +3270,8 @@ int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ISD != ISD::DELETED_NODE) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
+ std::pair<InstructionCost, MVT> LT =
+ TLI->getTypeLegalizationCost(DL, RetTy);
MVT MTy = LT.second;
// Attempt to lookup cost.
@@ -3006,7 +3294,8 @@ int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) {
static const CostTblEntry SLMCostTbl[] = {
{ ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
{ ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
@@ -3018,10 +3307,40 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
Type *ScalarType = Val->getScalarType();
int RegisterFileMoveCost = 0;
+ // Non-immediate extraction/insertion can be handled as a sequence of
+ // aliased loads+stores via the stack.
+ if (Index == -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
+ // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
+ // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
+
+ // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
+ assert(isa<FixedVectorType>(Val) && "Fixed vector type expected");
+ Align VecAlign = DL.getPrefTypeAlign(Val);
+ Align SclAlign = DL.getPrefTypeAlign(ScalarType);
+
+ // Extract - store vector to stack, load scalar.
+ if (Opcode == Instruction::ExtractElement) {
+ return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput);
+ }
+ // Insert - store vector to stack, store scalar, load vector.
+ if (Opcode == Instruction::InsertElement) {
+ return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput) +
+ getMemoryOpCost(Instruction::Load, Val, VecAlign, 0,
+ TTI::TargetCostKind::TCK_RecipThroughput);
+ }
+ }
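// A minimal usage sketch of the stack-based path above (hypothetical names
// `TTI` and `Ctx`): passing -1U as the index models an extract at an unknown
// position, which is costed as a vector store to the stack plus a scalar
// reload.
auto *F32x8 = FixedVectorType::get(Type::getFloatTy(Ctx), 8);
InstructionCost VarIdxExtractCost =
    TTI.getVectorInstrCost(Instruction::ExtractElement, F32x8, /*Index=*/-1U);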
+
if (Index != -1U && (Opcode == Instruction::ExtractElement ||
Opcode == Instruction::InsertElement)) {
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
// This type is legalized to a scalar type.
if (!LT.second.isVector())
@@ -3079,13 +3398,14 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
// subvector move(s).
// If the vector type is already less than 128-bits then don't reduce it.
// TODO: Under what circumstances should we shuffle using the full width?
- int ShuffleCost = 1;
+ InstructionCost ShuffleCost = 1;
if (Opcode == Instruction::InsertElement) {
auto *SubTy = cast<VectorType>(Val);
EVT VT = TLI->getValueType(DL, Val);
if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
SubTy = FixedVectorType::get(ScalarType, SubNumElts);
- ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
}
int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -3099,15 +3419,16 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
- const APInt &DemandedElts,
- bool Insert, bool Extract) {
- unsigned Cost = 0;
+InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert,
+ bool Extract) {
+ InstructionCost Cost = 0;
// For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
// cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
if (Insert) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MScalarTy = LT.second.getScalarType();
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
@@ -3131,8 +3452,10 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
// Case#2: inserting into 5th index needs extracti128 + vpinsrd +
// inserti128.
// Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128.
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first;
- unsigned NumElts = LT.second.getVectorNumElements() * LT.first;
+ const int CostValue = *LT.first.getValue();
+ assert(CostValue >= 0 && "Negative cost!");
+ unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue;
+ unsigned NumElts = LT.second.getVectorNumElements() * CostValue;
APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts);
unsigned Scale = NumElts / Num128Lanes;
// We iterate each 128-lane, and check if we need a
@@ -3182,10 +3505,11 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
return Cost;
}
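// A minimal usage sketch (hypothetical names `TTI` and `Ctx`): the overhead
// of building a full <8 x i16> vector from scalars, which the comment above
// notes can be cheaper to model as a BUILD_VECTOR-style initialization than
// as eight independent insertelements.
auto *I16x8 = FixedVectorType::get(Type::getInt16Ty(Ctx), 8);
APInt AllElts = APInt::getAllOnesValue(8);
InstructionCost BuildVecCost =
    TTI.getScalarizationOverhead(I16x8, AllElts, /*Insert=*/true,
                                 /*Extract=*/false);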
-int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- const Instruction *I) {
+InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
// TODO: Handle other cost kinds.
if (CostKind != TTI::TCK_RecipThroughput) {
if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
@@ -3199,57 +3523,146 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return TTI::TCC_Basic;
}
- // Handle non-power-of-two vectors such as <3 x float>
- if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
- unsigned NumElem = VTy->getNumElements();
-
- // Handle a few common cases:
- // <3 x float>
- if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
- // Cost = 64 bit store + extract + 32 bit store.
- return 3;
-
- // <3 x double>
- if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
- // Cost = 128 bit store + unpack + 64 bit store.
- return 3;
-
- // Assume that all other non-power-of-two numbers are scalarized.
- if (!isPowerOf2_32(NumElem)) {
- APInt DemandedElts = APInt::getAllOnesValue(NumElem);
- int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
- AddressSpace, CostKind);
- int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
- Opcode == Instruction::Load,
- Opcode == Instruction::Store);
- return NumElem * Cost + SplitCost;
- }
- }
-
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
// Type legalization can't handle structs
- if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
- "Invalid Opcode");
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+
+ auto *VTy = dyn_cast<FixedVectorType>(Src);
+
+ // Handle the simple case of non-vectors.
+ // NOTE: this assumes that legalization never creates a vector from scalars!
+ if (!VTy || !LT.second.isVector())
+ // Each load/store unit costs 1.
+ return LT.first * 1;
+
+ bool IsLoad = Opcode == Instruction::Load;
- // Each load/store unit costs 1.
- int Cost = LT.first * 1;
+ Type *EltTy = VTy->getElementType();
- // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
- // proxy for a double-pumped AVX memory interface such as on Sandybridge.
- if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
- Cost *= 2;
+ const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+
+ InstructionCost Cost = 0;
+
+ // Source of truth: how many elements were there in the original IR vector?
+ const unsigned SrcNumElt = VTy->getNumElements();
+
+ // How far have we gotten?
+ int NumEltRemaining = SrcNumElt;
+ // Note that we intentionally capture by reference: NumEltRemaining changes.
+ auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+
+ const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
+
+ // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+ const unsigned XMMBits = 128;
+ if (XMMBits % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ const int NumEltPerXMM = XMMBits / EltTyBits;
+
+ auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+
+ for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
+ NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
+ // How many elements would a single op deal with at once?
+ if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
+
+ assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+ assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
+ (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+ "Unless we haven't halved the op size yet, "
+ "we have less than two op's sized units of work left.");
+
+ auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+ ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+ : XMMVecTy;
+
+ assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+ "After halving sizes, the vector elt count is no longer a multiple "
+ "of number of elements per operation?");
+ auto *CoalescedVecTy =
+ CurrNumEltPerOp == 1
+ ? CurrVecTy
+ : FixedVectorType::get(
+ IntegerType::get(Src->getContext(),
+ EltTyBits * CurrNumEltPerOp),
+ CurrVecTy->getNumElements() / CurrNumEltPerOp);
+ assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+ DL.getTypeSizeInBits(CurrVecTy) &&
+ "coalesciing elements doesn't change vector width.");
+
+ while (NumEltRemaining > 0) {
+ assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+
+ // Can we use this vector size, as per the remaining element count?
+ // Iff the vector is naturally aligned, we can do a wide load regardless.
+ if (NumEltRemaining < CurrNumEltPerOp &&
+ (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
+ CurrOpSizeBytes != 1)
+ break; // Try smaller vector size.
+
+ bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+
+ // If we have fully processed the previous reg, we need to replenish it.
+ if (SubVecEltsLeft == 0) {
+ SubVecEltsLeft += CurrVecTy->getNumElements();
+ // And that's free only for the 0'th subvector of a legalized vector.
+ if (!Is0thSubVec)
+ Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+ : TTI::ShuffleKind::SK_ExtractSubvector,
+ VTy, None, NumEltDone(), CurrVecTy);
+ }
+
+ // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+ // for smaller widths (32/16/8) we have to insert/extract them separately.
+ // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+ // but let's pretend that it is also true for 16/8 bit wide ops...)
+ if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+ int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+ assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+ int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+ APInt DemandedElts =
+ APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+ CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+ assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+ Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+ !IsLoad);
+ }
+
+ // This isn't exactly right. We're using slow unaligned 32-byte accesses
+ // as a proxy for a double-pumped AVX memory interface such as on
+ // Sandybridge.
+ if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+ Cost += 2;
+ else
+ Cost += 1;
+
+ SubVecEltsLeft -= CurrNumEltPerOp;
+ NumEltRemaining -= CurrNumEltPerOp;
+ Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+ }
+ }
+
+ assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
return Cost;
}
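// A worked sketch of the loop above (hypothetical names `TTI` and `Ctx`): an
// unaligned <3 x float> load legalizes to v4f32, so MaxLegalOpSizeBytes is 16.
// The 16-byte op is skipped (only 3 elements remain and the load is not
// naturally aligned), an 8-byte op covers elements 0-1, and a 4-byte op covers
// element 2 plus a scalarization insert into the XMM register.
InstructionCost V3F32LoadCost = TTI.getMemoryOpCost(
    Instruction::Load, FixedVectorType::get(Type::getFloatTy(Ctx), 3),
    MaybeAlign(Align(4)), /*AddressSpace=*/0, TTI::TCK_RecipThroughput);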
-int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- Align Alignment, unsigned AddressSpace,
- TTI::TargetCostKind CostKind) {
+InstructionCost
+X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
@@ -3262,40 +3675,39 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
auto *MaskTy =
FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
- (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
- !isPowerOf2_32(NumElem)) {
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
// Scalarization
APInt DemandedElts = APInt::getAllOnesValue(NumElem);
- int MaskSplitCost =
+ InstructionCost MaskSplitCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost = getCmpSelInstrCost(
+ InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
- int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
- int ValueSplitCost =
+ InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
+ InstructionCost ValueSplitCost =
getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
- int MemopCost =
+ InstructionCost MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace, CostKind);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
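// A minimal sketch of the scalarization branch above (hypothetical names
// `TTI` and `Ctx`): on a subtarget without the corresponding masked-move
// support, a masked <8 x i32> load is charged eight scalar loads plus mask
// extraction, per-element compares and branches, and the cost of reassembling
// the result vector.
InstructionCost ScalarizedMaskedLoadCost = TTI.getMaskedMemoryOpCost(
    Instruction::Load, FixedVectorType::get(Type::getInt32Ty(Ctx), 8),
    Align(4), /*AddressSpace=*/0, TTI::TCK_RecipThroughput);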
// Legalize the type.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy);
auto VT = TLI->getValueType(DL, SrcVTy);
- int Cost = 0;
+ InstructionCost Cost = 0;
if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
- // Promotion requires expand/truncate for data and a shuffle for mask.
- Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
- getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
+ // Promotion requires extend/truncate for data and a shuffle for mask.
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
+ getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
- else if (LT.second.getVectorNumElements() > NumElem) {
+ else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
LT.second.getVectorNumElements());
// Expanding requires fill mask with zeroes
- Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+ Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
}
// Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
@@ -3306,8 +3718,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
return Cost + LT.first;
}
-int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
- const SCEV *Ptr) {
+InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty,
+ ScalarEvolution *SE,
+ const SCEV *Ptr) {
// Address computations in vectorized code with non-consecutive addresses will
// likely result in more instructions compared to scalar code where the
// computation can more often be merged into the index mode. The resulting
@@ -3331,12 +3744,12 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
- bool IsPairwise,
- TTI::TargetCostKind CostKind) {
- // Just use the default implementation for pair reductions.
- if (IsPairwise)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+InstructionCost
+X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ Optional<FastMathFlags> FMF,
+ TTI::TargetCostKind CostKind) {
+ if (TTI::requiresOrderedReduction(FMF))
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
// We use the Intel Architecture Code Analyzer (IACA) to measure the
// throughput and use that as the cost.
@@ -3348,6 +3761,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
+ { ISD::FADD, MVT::v2f32, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
{ ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
{ ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
@@ -3394,13 +3808,23 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
return Entry->Cost;
}
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
auto *ValVTy = cast<FixedVectorType>(ValTy);
- unsigned ArithmeticCost = 0;
+ // Special case: vXi8 mul reductions are performed as vXi16.
+ if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
+ auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
+ auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
+ return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
+ TargetTransformInfo::CastContextHint::None,
+ CostKind) +
+ getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind);
+ }
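// A minimal sketch of the special case above (hypothetical names `TTI` and
// `Ctx`): a <16 x i8> mul reduction is costed as a zero-extension to
// <16 x i16> plus a v16i16 mul reduction.
auto *I8x16 = FixedVectorType::get(Type::getInt8Ty(Ctx), 16);
InstructionCost MulRdxCost = TTI.getArithmeticReductionCost(
    Instruction::Mul, I8x16, /*FMF=*/None, TTI::TCK_RecipThroughput);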
+
+ InstructionCost ArithmeticCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 arithmetic ops.
@@ -3470,7 +3894,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
// Handle bool allof/anyof patterns.
if (ValVTy->getElementType()->isIntegerTy(1)) {
- unsigned ArithmeticCost = 0;
+ InstructionCost ArithmeticCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 arithmetic ops.
@@ -3493,8 +3917,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
return ArithmeticCost + Entry->Cost;
- return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
}
unsigned NumVecElts = ValVTy->getNumElements();
@@ -3503,10 +3926,9 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
// Special case power of 2 reductions where the scalar type isn't changed
// by type legalization.
if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
- return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
- CostKind);
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind);
- unsigned ReductionCost = 0;
+ InstructionCost ReductionCost = 0;
auto *Ty = ValVTy;
if (LT.first != 1 && MTy.isVector() &&
@@ -3529,7 +3951,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
if (Size > 128) {
auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
ReductionCost +=
- getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
Ty = SubTy;
} else if (Size == 128) {
// Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3541,7 +3963,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
ShufTy =
FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
ReductionCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else if (Size == 64) {
// Reducing from 64 bits is a shuffle of v4f32/v4i32.
FixedVectorType *ShufTy;
@@ -3552,7 +3974,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
ShufTy =
FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
ReductionCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else {
// Reducing from smaller size is a shift by immediate.
auto *ShiftTy = FixedVectorType::get(
@@ -3572,8 +3994,9 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
-int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
+ bool IsUnsigned) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MTy = LT.second;
@@ -3691,21 +4114,19 @@ int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// Otherwise fall back to cmp+select.
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
- CostKind) +
- getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost Result =
+ getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ return Result;
}
-int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
- // Just use the default implementation for pair reductions.
- if (IsPairwise)
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+InstructionCost
+X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -3785,7 +4206,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
unsigned NumVecElts = ValVTy->getNumElements();
auto *Ty = ValVTy;
- unsigned MinMaxCost = 0;
+ InstructionCost MinMaxCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 operations.
@@ -3820,8 +4241,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
// by type legalization.
if (!isPowerOf2_32(ValVTy->getNumElements()) ||
ScalarSize != MTy.getScalarSizeInBits())
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);
// Now handle reduction with the legal type, taking into account size changes
// at each level.
@@ -3833,7 +4253,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
if (Size > 128) {
auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
MinMaxCost +=
- getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
Ty = SubTy;
} else if (Size == 128) {
// Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3844,7 +4264,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
else
ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else if (Size == 64) {
// Reducing from 64 bits is a shuffle of v4f32/v4i32.
FixedVectorType *ShufTy;
@@ -3853,7 +4273,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
else
ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else {
// Reducing from smaller size is a shift by immediate.
auto *ShiftTy = FixedVectorType::get(
@@ -3878,7 +4298,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-int X86TTIImpl::getIntImmCost(int64_t Val) {
+InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) {
if (Val == 0)
return TTI::TCC_Free;
@@ -3888,8 +4308,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) {
return 2 * TTI::TCC_Basic;
}
-int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3913,20 +4333,20 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
- int Cost = 0;
+ InstructionCost Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialize the constant.
- return std::max(1, Cost);
+ return std::max<InstructionCost>(1, Cost);
}
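// A worked sketch of the chunking above (hypothetical names `TTI` and `Ctx`):
// a 128-bit immediate is split into two 64-bit halves, each costed with the
// 64-bit helper, and at least one instruction is always charged.
APInt WideImm(128, 42);
InstructionCost WideImmCost = TTI.getIntImmCost(
    WideImm, Type::getInt128Ty(Ctx), TTI::TCK_RecipThroughput);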
-int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind,
- Instruction *Inst) {
+InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind,
+ Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -4013,7 +4433,7 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Idx == ImmIdx) {
int NumConstants = divideCeil(BitSize, 64);
- int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
+ InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -4022,9 +4442,9 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty,
- TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -4058,12 +4478,13 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned
-X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
if (CostKind != TTI::TCK_RecipThroughput)
return Opcode == Instruction::PHI ? 0 : 1;
// Branches are assumed to be predicted.
- return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+ return 0;
}
int X86TTIImpl::getGatherOverhead() const {
@@ -4088,8 +4509,9 @@ int X86TTIImpl::getScatterOverhead() const {
// Return an average cost of a Gather / Scatter instruction; may be improved later.
// FIXME: Add TargetCostKind support.
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
- Align Alignment, unsigned AddressSpace) {
+InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy,
+ const Value *Ptr, Align Alignment,
+ unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
@@ -4131,9 +4553,12 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
auto *IndexVTy = FixedVectorType::get(
IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
- std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
- std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
- int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
+ std::pair<InstructionCost, MVT> IdxsLT =
+ TLI->getTypeLegalizationCost(DL, IndexVTy);
+ std::pair<InstructionCost, MVT> SrcLT =
+ TLI->getTypeLegalizationCost(DL, SrcVTy);
+ InstructionCost::CostType SplitFactor =
+ *std::max(IdxsLT.first, SrcLT.first).getValue();
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
auto *SplitSrcTy =
@@ -4161,32 +4586,32 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
/// AddressSpace - pointer[s] address space.
///
/// FIXME: Add TargetCostKind support.
-int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
- bool VariableMask, Align Alignment,
- unsigned AddressSpace) {
+InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
+ bool VariableMask, Align Alignment,
+ unsigned AddressSpace) {
unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
APInt DemandedElts = APInt::getAllOnesValue(VF);
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- int MaskUnpackCost = 0;
+ InstructionCost MaskUnpackCost = 0;
if (VariableMask) {
auto *MaskTy =
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
MaskUnpackCost =
getScalarizationOverhead(MaskTy, DemandedElts, false, true);
- int ScalarCompareCost = getCmpSelInstrCost(
+ InstructionCost ScalarCompareCost = getCmpSelInstrCost(
Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind);
- int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
+ InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
- int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ InstructionCost MemoryOpCost =
+ VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
+ MaybeAlign(Alignment), AddressSpace, CostKind);
- int InsertExtractCost = 0;
+ InstructionCost InsertExtractCost = 0;
if (Opcode == Instruction::Load)
for (unsigned i = 0; i < VF; ++i)
// Add the cost of inserting each scalar load into the vector
@@ -4202,11 +4627,10 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
}
/// Calculate the cost of Gather / Scatter operation
-int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
- const Value *Ptr, bool VariableMask,
- Align Alignment,
- TTI::TargetCostKind CostKind,
- const Instruction *I = nullptr) {
+InstructionCost X86TTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
if (CostKind != TTI::TCK_RecipThroughput) {
if ((Opcode == Instruction::Load &&
isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
@@ -4218,7 +4642,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
}
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
- unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
PtrTy = dyn_cast<PointerType>(
@@ -4226,22 +4649,10 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
- bool Scalarize = false;
if ((Opcode == Instruction::Load &&
!isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
(Opcode == Instruction::Store &&
!isLegalMaskedScatter(SrcVTy, Align(Alignment))))
- Scalarize = true;
- // Gather / Scatter for vector 2 is not profitable on KNL / SKX
- // Vector-4 of gather/scatter instruction does not exist on KNL.
- // We can extend it to 8 elements, but zeroing upper bits of
- // the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
- // is better in the VariableMask case.
- if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
- Scalarize = true;
-
- if (Scalarize)
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
AddressSpace);
@@ -4377,6 +4788,14 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
unsigned NumElts = DataVTy->getNumElements();
if (NumElts == 1)
return false;
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX.
+ // Vector-4 gather/scatter instructions do not exist on KNL.
+ // We can extend it to 8 elements, but zeroing upper bits of
+ // the mask vector will add more instructions. Right now we give the scalar
+ // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
+ // instruction is better in the VariableMask case.
+ if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
+ return false;
}
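// A minimal sketch of the check above (hypothetical names `TTI` and `Ctx`):
// on an AVX-512 subtarget a 2-element gather is reported as not legal here,
// so getGatherScatterOpCost later falls back to the scalarized
// getGSScalarCost path.
bool TwoEltGatherLegal = TTI.isLegalMaskedGather(
    FixedVectorType::get(Type::getDoubleTy(Ctx), 2), Align(8));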
Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
@@ -4493,7 +4912,7 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
@@ -4507,8 +4926,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
// TODO: Support also strided loads (interleaved-groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -4520,86 +4938,78 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
// (see MachineValueType.h::getVectorVT()).
if (!LegalVT.isVector())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
unsigned VF = VecTy->getNumElements() / Factor;
Type *ScalarTy = VecTy->getElementType();
+ // Deduplicate entries, model floats/pointers as appropriately-sized integers.
+ if (!ScalarTy->isIntegerTy())
+ ScalarTy =
+ Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy));
- // Calculate the number of memory operations (NumOfMemOps), required
- // for load/store the VecTy.
- unsigned VecTySize = DL.getTypeStoreSize(VecTy);
- unsigned LegalVTSize = LegalVT.getStoreSize();
- unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
-
- // Get the cost of one memory operation.
- auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
- LegalVT.getVectorNumElements());
- unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ // Get the cost of all the memory operations.
+ InstructionCost MemOpCosts = getMemoryOpCost(
+ Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind);
auto *VT = FixedVectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- CostKind);
+ Alignment, AddressSpace, CostKind);
// TODO: Complete for other data-types and strides.
- // Each combination of Stride, ElementTy and VF results in a different
+ // Each combination of Stride, element bit width and VF results in a different
// sequence; The cost tables are therefore accessed with:
- // Factor (stride) and VectorType=VFxElemType.
+ // Factor (stride) and VectorType=VFxiN.
// The Cost accounts only for the shuffle sequence;
// The cost of the loads/stores is accounted for separately.
//
static const CostTblEntry AVX2InterleavedLoadTbl[] = {
- { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
- { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
-
- { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
- { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
- { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
- { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8
- { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8
- { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32
-
- { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
- { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
- { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
- { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
-
- { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32
+ {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64
+
+ {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8
+ {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8
+ {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8
+ {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
+ {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8
+
+ {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32
+
+ {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8
+ {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8
+ {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8
+ {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8
+ {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8
+
+ {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32
};
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
- { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
- { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
-
- { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
- { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
- { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
- { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
- { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
-
- { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
- { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
- { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
- { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
- { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
+ {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store)
+
+ {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store)
+ {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store)
+ {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store)
+ {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
+ {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
+
+ {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store)
+ {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store)
+ {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store)
};
if (Opcode == Instruction::Load) {
if (const auto *Entry =
CostTableLookup(AVX2InterleavedLoadTbl, Factor, ETy.getSimpleVT()))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MemOpCosts + Entry->Cost;
} else {
assert(Opcode == Instruction::Store &&
"Expected Store Instruction at this point");
if (const auto *Entry =
CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
- return NumOfMemOps * MemOpCost + Entry->Cost;
+ return MemOpCosts + Entry->Cost;
}
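// Worked illustration of the lookup above (plain C++ model, not LLVM API;
// the two rows are copied from the AVX2 load table, the struct and helper
// names are made up). A stride-3 load of 8 x float is re-keyed as v8i32,
// matches the {3, MVT::v8i32, 17} row, and the returned cost is the cost of
// the single wide load plus those 17 shuffle-sequence instructions.
struct InterleaveModelEntry {
  unsigned Factor, VF, EltBits, ShuffleCost;
};
static const InterleaveModelEntry AVX2LoadModel[] = {
    {3, 8, 32, 17}, // {3, MVT::v8i32, 17}
    {8, 8, 32, 40}, // {8, MVT::v8i32, 40}
};
static int interleavedLoadCostModel(unsigned Factor, unsigned VF,
                                    unsigned EltBits, unsigned MemOpCosts) {
  for (const InterleaveModelEntry &E : AVX2LoadModel)
    if (E.Factor == Factor && E.VF == VF && E.EltBits == EltBits)
      return MemOpCosts + E.ShuffleCost;
  return -1; // no table entry: the real code falls back to the generic cost
}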
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -4610,7 +5020,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
@@ -4634,9 +5044,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// Get the cost of one memory operation.
auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
LegalVT.getVectorNumElements());
- unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace,
- CostKind);
+ InstructionCost MemOpCost = getMemoryOpCost(
+ Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);
unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
@@ -4665,14 +5074,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
TTI::ShuffleKind ShuffleKind =
(NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
- unsigned ShuffleCost =
- getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+ InstructionCost ShuffleCost =
+ getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
unsigned NumOfLoadsInInterleaveGrp =
Indices.size() ? Indices.size() : Factor;
auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
VecTy->getNumElements() / Factor);
- unsigned NumOfResults =
+ InstructionCost NumOfResults =
getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
NumOfLoadsInInterleaveGrp;
@@ -4688,12 +5097,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
// When we have more than one destination, we need additional instructions
// to preserve the sources.
- unsigned NumOfMoves = 0;
+ InstructionCost NumOfMoves = 0;
if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
- int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
- NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+ InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+ NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
return Cost;
}
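// Plain-C++ model of the load-cost formula above (illustrative; the
// parameter names mirror the variables in the code, the values are whatever
// the surrounding code computed, and the function name is made up).
static unsigned avx512InterleavedLoadCostModel(
    unsigned NumOfResults, unsigned NumOfShufflesPerResult,
    unsigned ShuffleCost, unsigned NumOfUnfoldedLoads, unsigned MemOpCost,
    bool UsesTwoSrcPermute) {
  // Extra moves are needed only when a two-source permute clobbers an input
  // that another result still needs.
  unsigned NumOfMoves = 0;
  if (NumOfResults > 1 && UsesTwoSrcPermute)
    NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
  return NumOfResults * NumOfShufflesPerResult * ShuffleCost +
         NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
}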
@@ -4721,19 +5130,20 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
// There are no strided stores at the moment, and a store can't be folded
// into a shuffle.
unsigned NumOfSources = Factor; // The number of values to be merged.
- unsigned ShuffleCost =
- getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+ InstructionCost ShuffleCost =
+ getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
unsigned NumOfShufflesPerStore = NumOfSources - 1;
// The SK_PermuteTwoSrc shuffle clobbers one of the source operands.
// We need additional instructions to preserve the sources.
unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
- int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
- NumOfMoves;
+ InstructionCost Cost =
+ NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+ NumOfMoves;
return Cost;
}
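// Plain-C++ model of the store-cost formula above (illustrative sketch; the
// parameter names mirror the variables in the code, the function name is
// made up).
static unsigned avx512InterleavedStoreCostModel(unsigned NumOfMemOps,
                                                unsigned MemOpCost,
                                                unsigned ShuffleCost,
                                                unsigned Factor) {
  // Each store merges Factor sources, which takes Factor - 1 two-source
  // shuffles, and roughly every other shuffle needs a move to preserve a
  // source that is still live.
  unsigned NumOfShufflesPerStore = Factor - 1;
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  return NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
         NumOfMoves;
}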
-int X86TTIImpl::getInterleavedMemoryOpCost(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) {