Diffstat (limited to 'llvm/lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r-- | llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 1772 |
1 file changed, 1091 insertions, 681 deletions
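Editor's note: the bulk of this diff migrates the X86 TTI cost hooks from plain int to llvm::InstructionCost. A minimal standalone sketch (not part of the patch) of how that type behaves, assuming the InstructionCost API of this period (implicit construction from an integer, isValid(), getValue(), getInvalid()):

  #include "llvm/Support/InstructionCost.h"
  #include <cassert>

  int main() {
    llvm::InstructionCost Known = 4; // behaves like an integer cost
    llvm::InstructionCost Unknown = llvm::InstructionCost::getInvalid();

    llvm::InstructionCost Sum = Known + Known; // ordinary arithmetic
    assert(Sum.isValid() && *Sum.getValue() == 8);

    // Arithmetic involving an invalid cost stays invalid, so "cannot be
    // costed" propagates through composed sums instead of a magic -1.
    assert(!(Known + Unknown).isValid());
    return 0;
  }

The payoff is visible below: getArithmeticInstrCost can compose sub-costs (e.g. zext + mul + trunc) by plain addition with no sentinel values.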
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 71455237fb61..971c430d73b1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -129,26 +129,30 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
   return 8;
 }
 
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
+TypeSize
+X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
-  if (Vector) {
+  switch (K) {
+  case TargetTransformInfo::RGK_Scalar:
+    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
+  case TargetTransformInfo::RGK_FixedWidthVector:
     if (ST->hasAVX512() && PreferVectorWidth >= 512)
-      return 512;
+      return TypeSize::getFixed(512);
     if (ST->hasAVX() && PreferVectorWidth >= 256)
-      return 256;
+      return TypeSize::getFixed(256);
     if (ST->hasSSE1() && PreferVectorWidth >= 128)
-      return 128;
-    return 0;
+      return TypeSize::getFixed(128);
+    return TypeSize::getFixed(0);
+  case TargetTransformInfo::RGK_ScalableVector:
+    return TypeSize::getScalable(0);
   }
-  if (ST->is64Bit())
-    return 64;
-
-  return 32;
+  llvm_unreachable("Unsupported register kind");
 }
 
 unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
-  return getRegisterBitWidth(true);
+  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+      .getFixedSize();
 }
 
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
@@ -169,21 +173,35 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   return 2;
 }
 
-int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                       TTI::TargetCostKind CostKind,
-                                       TTI::OperandValueKind Op1Info,
-                                       TTI::OperandValueKind Op2Info,
-                                       TTI::OperandValueProperties Opd1PropInfo,
-                                       TTI::OperandValueProperties Opd2PropInfo,
-                                       ArrayRef<const Value *> Args,
-                                       const Instruction *CxtI) {
+InstructionCost X86TTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+    TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
+    const Instruction *CxtI) {
   // TODO: Handle more cost kinds.
   if (CostKind != TTI::TCK_RecipThroughput)
     return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                          Op2Info, Opd1PropInfo, Opd2PropInfo,
                                          Args, CxtI);
+
+  // vXi8 multiplications are always promoted to vXi16.
+  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
+      Ty->getScalarSizeInBits() == 8) {
+    Type *WideVecTy =
+        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
+    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
+                            TargetTransformInfo::CastContextHint::None,
+                            CostKind) +
+           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
+                            TargetTransformInfo::CastContextHint::None,
+                            CostKind) +
+           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
+                                  Opd1PropInfo, Opd2PropInfo);
+  }
+
   // Legalize the type.
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
@@ -203,7 +221,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   static const CostTblEntry SLMCostTable[] = {
     { ISD::MUL, MVT::v4i32, 11 }, // pmulld
     { ISD::MUL, MVT::v8i16, 2 }, // pmullw
-    { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
     { ISD::FMUL, MVT::f64, 2 }, // mulsd
     { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
     { ISD::FMUL, MVT::v4f32, 2 }, // mulps
@@ -261,10 +278,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // normally expanded to the sequence SRA + SRL + ADD + SRA.
     // The OperandValue properties may not be the same as that of the previous
     // operation; conservatively assume OP_None.
-    int Cost =
+    InstructionCost Cost =
         2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
-                                   Op2Info,
-                                   TargetTransformInfo::OP_None,
+                                   Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
     Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info,
@@ -491,14 +507,22 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   }
 
   static const CostTblEntry AVX512BWShiftCostTable[] = {
+    { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
+    { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
+    { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
+    { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
+    { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
+    { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
+    { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
+    { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
+    { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
+
     { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
     { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
     { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
     { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
     { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
     { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
     { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
     { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
     { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
@@ -516,6 +540,12 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
     { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
     { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
+
+    { ISD::SHL, MVT::v8i32, 1 }, // pslld
+    { ISD::SRL, MVT::v8i32, 1 }, // psrld
+    { ISD::SRA, MVT::v8i32, 1 }, // psrad
+    { ISD::SHL, MVT::v4i64, 1 }, // psllq
+    { ISD::SRL, MVT::v4i64, 1 }, // psrlq
   };
 
   if (ST->hasAVX2() &&
@@ -549,9 +579,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   }
 
   static const CostTblEntry AVX512DQCostTable[] = {
-    { ISD::MUL, MVT::v2i64, 1 },
-    { ISD::MUL, MVT::v4i64, 1 },
-    { ISD::MUL, MVT::v8i64, 1 }
+    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
+    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
+    { ISD::MUL, MVT::v8i64, 2 }  // pmullq
   };
 
   // Look for AVX512DQ lowering tricks for custom cases.
@@ -563,10 +593,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
     { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
     { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
-
-    { ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
   };
 
   // Look for AVX512BW lowering tricks for custom cases.
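Editor's note: the first hunk above replaces the bool-based getRegisterBitWidth() with the RegisterKind/TypeSize API. A hedged caller-side sketch (the helper name is invented for illustration; the TTI calls mirror the hunk):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/Support/TypeSize.h"
  #include <cassert>

  // Hypothetical helper: query the widest fixed vector register in bits.
  unsigned widestFixedVectorBits(const llvm::TargetTransformInfo &TTI) {
    llvm::TypeSize TS = TTI.getRegisterBitWidth(
        llvm::TargetTransformInfo::RGK_FixedWidthVector);
    // On x86 the scalable query returns TypeSize::getScalable(0), so the
    // fixed-width kind is the only meaningful one here.
    assert(!TS.isScalable() && "x86 reports fixed-width vectors only");
    return TS.getFixedSize(); // e.g. 512 with AVX-512 and PreferVectorWidth >= 512
  }

getLoadStoreVecRegBitWidth() in the same hunk does exactly this internally via getFixedSize().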
@@ -575,10 +601,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX512CostTable[] = {
+    { ISD::SHL, MVT::v4i32, 1 },
+    { ISD::SRL, MVT::v4i32, 1 },
+    { ISD::SRA, MVT::v4i32, 1 },
+    { ISD::SHL, MVT::v8i32, 1 },
+    { ISD::SRL, MVT::v8i32, 1 },
+    { ISD::SRA, MVT::v8i32, 1 },
     { ISD::SHL, MVT::v16i32, 1 },
     { ISD::SRL, MVT::v16i32, 1 },
     { ISD::SRA, MVT::v16i32, 1 },
 
+    { ISD::SHL, MVT::v2i64, 1 },
+    { ISD::SRL, MVT::v2i64, 1 },
+    { ISD::SHL, MVT::v4i64, 1 },
+    { ISD::SRL, MVT::v4i64, 1 },
     { ISD::SHL, MVT::v8i64, 1 },
     { ISD::SRL, MVT::v8i64, 1 },
@@ -586,21 +622,28 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SRA, MVT::v4i64, 1 },
     { ISD::SRA, MVT::v8i64, 1 },
 
-    { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
     { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
     { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
     { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
-    { ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
+    { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
+
+    { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
     { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
     { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
     { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
+
+    { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
     { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
     { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
     { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
+    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
   };
 
   if (ST->hasAVX512())
@@ -608,18 +651,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry AVX2ShiftCostTable[] = {
-    // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
+    // Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
-    { ISD::SHL, MVT::v4i32, 1 },
-    { ISD::SRL, MVT::v4i32, 1 },
-    { ISD::SRA, MVT::v4i32, 1 },
-    { ISD::SHL, MVT::v8i32, 1 },
-    { ISD::SRL, MVT::v8i32, 1 },
-    { ISD::SRA, MVT::v8i32, 1 },
-    { ISD::SHL, MVT::v2i64, 1 },
-    { ISD::SRL, MVT::v2i64, 1 },
-    { ISD::SHL, MVT::v4i64, 1 },
-    { ISD::SRL, MVT::v4i64, 1 },
+    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
+    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
+    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
+    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
+    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
+    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
+    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
+    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
+    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
+    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
   };
 
   if (ST->hasAVX512()) {
@@ -634,8 +677,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   TargetTransformInfo::OP_None);
   }
 
-  // Look for AVX2 lowering tricks.
-  if (ST->hasAVX2()) {
+  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
+  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
     if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
         (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
          Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
@@ -733,22 +776,28 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   }
 
   static const CostTblEntry AVX2CostTable[] = {
-    { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
-    { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
-    { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-    { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
-
-    { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
-    { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
-    { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-    { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
-
-    { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
-    { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
-    { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
-    { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
-    { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
-    { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+    { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
+    { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
+    { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+    { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+    { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+    { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
+
+    { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
+    { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
+    { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
+    { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
+    { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
+    { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
+
+    { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
+    { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
+    { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
+    { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
+    { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
+    { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
+    { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
 
     { ISD::SUB, MVT::v32i8, 1 }, // psubb
     { ISD::ADD, MVT::v32i8, 1 }, // paddb
@@ -759,16 +808,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SUB, MVT::v4i64, 1 }, // psubq
     { ISD::ADD, MVT::v4i64, 1 }, // paddq
 
-    { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
-    { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
     { ISD::MUL, MVT::v16i16, 1 }, // pmullw
     { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
-    { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+    { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
+
+    { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
+    { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
     { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
     { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
@@ -790,7 +841,9 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
     { ISD::MUL, MVT::v16i16, 4 },
-    { ISD::MUL, MVT::v8i32, 4 },
+    { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
+    { ISD::MUL, MVT::v4i64, 12 },
+
     { ISD::SUB, MVT::v32i8, 4 },
     { ISD::ADD, MVT::v32i8, 4 },
     { ISD::SUB, MVT::v16i16, 4 },
@@ -800,14 +853,34 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SUB, MVT::v4i64, 4 },
     { ISD::ADD, MVT::v4i64, 4 },
 
-    // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
-    // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
-    // Because we believe v4i64 to be a legal type, we must also include the
-    // extract+insert in the cost table. Therefore, the cost here is 18
-    // instead of 8.
-    { ISD::MUL, MVT::v4i64, 18 },
-
-    { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+    { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
+    { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
+    { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
+    { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
+    { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
+    { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
+    { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
+
+    { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
+    { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
+    { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
+    { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+    { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
+    { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
+
+    { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
+    { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
+    { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
+    { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
+    { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
+    { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
+
+    { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
+
+    { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
+    { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
 
     { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
@@ -841,6 +914,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
     { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
     { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
+
+    { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
   };
 
   if (ST->hasSSE42())
@@ -848,26 +923,16 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       return LT.first * Entry->Cost;
 
   static const CostTblEntry SSE41CostTable[] = {
-    { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence.
-    { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split.
-    { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
-    { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
+    { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
+    { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
     { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
-    { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split
-
-    { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
-    { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split.
-    { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence.
-    { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
-    { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend.
-    { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split.
-
-    { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence.
-    { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split.
-    { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence.
-    { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split.
-    { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
-    { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
+
+    { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
+    { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
+    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+
+    { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
+    { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
 
     { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
   };
@@ -879,25 +944,21 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   static const CostTblEntry SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
-    { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence.
-    { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence.
-    { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul.
+    { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
+    { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
+    { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
     { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
-    { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
 
-    { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence.
-    { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence.
-    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
+    { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
+    { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
+    { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
     { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
-    { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split.
 
-    { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence.
-    { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence.
-    { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend.
-    { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence.
-    { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split.
+    { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
+    { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
+    { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
+    { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
 
-    { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
     { ISD::MUL, MVT::v8i16, 1 }, // pmullw
     { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
     { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
@@ -907,6 +968,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
     { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
 
+    { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
+    { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
+    { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
+    { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
+
     { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
     { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
 
@@ -922,25 +988,42 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
     { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
 
+    { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
+    { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
+
     { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
     { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
 
     { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
     { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
-
-    { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
-    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
-    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
-
-    { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
-    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
-    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
   };
 
   if (ST->hasSSE1())
     if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
 
+  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
+  };
+
+  if (ST->is64Bit())
+    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
+      return LT.first * Entry->Cost;
+
+  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+    { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+
+    { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
+    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
+    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
+  };
+
+  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
+    return LT.first * Entry->Cost;
+
   // It is not a good idea to vectorize division. We have to scalarize it and
   // in the process we will often end up having to spilling regular
   // registers. The overhead of division is going to dominate most kernels
@@ -949,7 +1032,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   // to hide "20 cycles" for each lane.
   if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                                ISD == ISD::UDIV || ISD == ISD::UREM)) {
-    int ScalarCost = getArithmeticInstrCost(
+    InstructionCost ScalarCost = getArithmeticInstrCost(
         Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
         TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
     return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
@@ -959,12 +1042,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
 }
 
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
-                               int Index, VectorType *SubTp) {
+InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
+                                           VectorType *BaseTp,
+                                           ArrayRef<int> Mask, int Index,
+                                           VectorType *SubTp) {
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
-  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
+  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
 
+  Kind = improveShuffleKindFromMask(Kind, Mask);
   // Treat Transpose as 2-op shuffles - there's no difference in lowering.
   if (Kind == TTI::SK_Transpose)
     Kind = TTI::SK_PermuteTwoSrc;
@@ -981,7 +1067,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
     int NumElts = LT.second.getVectorNumElements();
     if ((Index % NumElts) == 0)
       return 0;
-    std::pair<int, MVT> SubLT = TLI->getTypeLegalizationCost(DL, SubTp);
+    std::pair<InstructionCost, MVT> SubLT =
+        TLI->getTypeLegalizationCost(DL, SubTp);
     if (SubLT.second.isVector()) {
       int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
@@ -1006,8 +1093,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
       auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                          SubLT.second.getVectorNumElements());
       int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
-      int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
-                                       ExtractIndex, SubTy);
+      InstructionCost ExtractCost = getShuffleCost(
+          TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
 
       // If the original size is 32-bits or more, we can use pshufd. Otherwise
      // if we have SSSE3 we can use pshufb.
@@ -1022,6 +1109,20 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
     }
   }
 
+  // Subvector insertions are cheap if the subvectors are aligned.
+  // Note that in general, the insertion starting at the beginning of a vector
+  // isn't free, because we need to preserve the rest of the wide vector.
+  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
+    int NumElts = LT.second.getVectorNumElements();
+    std::pair<InstructionCost, MVT> SubLT =
+        TLI->getTypeLegalizationCost(DL, SubTp);
+    if (SubLT.second.isVector()) {
+      int NumSubElts = SubLT.second.getVectorNumElements();
+      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
+        return SubLT.first;
+    }
+  }
+
   // Handle some common (illegal) sub-vector types as they are often very cheap
   // to shuffle even on targets without PSHUFB.
   EVT VT = TLI->getValueType(DL, BaseTp);
@@ -1074,24 +1175,24 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
       // Number of source vectors after legalization:
       unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
       // Number of destination vectors after legalization:
-      unsigned NumOfDests = LT.first;
+      InstructionCost NumOfDests = LT.first;
 
       auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                               LegalVT.getVectorNumElements());
 
-      unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
-      return NumOfShuffles *
-             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
+      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                            None, 0, nullptr);
     }
 
-    return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
   }
 
  // For 2-input shuffles, we must account for splitting the 2 inputs into many.
   if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
     // We assume that source and destination have the same vector type.
-    int NumOfDests = LT.first;
-    int NumOfShufflesPerDest = LT.first * 2 - 1;
+    InstructionCost NumOfDests = LT.first;
+    InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
     LT.first = NumOfDests * NumOfShufflesPerDest;
   }
 
@@ -1150,6 +1251,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
       {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
       {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
       {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
+      {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+      {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
 
       {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
       {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1392,26 +1495,29 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
     if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
       return LT.first * Entry->Cost;
 
-  return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
 }
 
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
-                                 TTI::CastContextHint CCH,
-                                 TTI::TargetCostKind CostKind,
-                                 const Instruction *I) {
+InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
                                             Type *Src,
+                                             TTI::CastContextHint CCH,
+                                             TTI::TargetCostKind CostKind,
+                                             const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
   // TODO: Allow non-throughput costs that aren't binary.
-  auto AdjustCost = [&CostKind](int Cost) {
+  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
 
+  // The cost tables include both specific, custom (non-legal) src/dst type
+  // conversions and generic, legalized types. We test for customs first, before
+  // falling back to legalization.
   // FIXME: Need a better design of the cost table to handle non-simple types of
   // potential massive combinations (elem_num x src_type x dst_type).
-
   static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
     { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
     { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
@@ -1446,10 +1552,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
     { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
     { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
     { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
@@ -1494,11 +1603,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
     { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
-    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
-    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
-    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
+    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdb
+    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
+    { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
+    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
     { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
@@ -1554,33 +1667,40 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
 
-    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
-    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+    { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+    { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
 
     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
     { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
-    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
-    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
-    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
-    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
     { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
 
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
-    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
+    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
-    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
     { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
     { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
 
-    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
+    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
+    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
+    { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
     { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
-    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
-    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
+    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
+    { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
+    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
 
     { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
     { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
@@ -1636,12 +1756,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
 
-    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
     { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
     { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
     { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
 
-    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
     { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
     { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
     { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
@@ -1662,6 +1782,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
     { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
+    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
+    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
 
     // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
     // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
@@ -1696,14 +1819,31 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
 
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
-    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
     { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
     { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
@@ -1711,20 +1851,17 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
     { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
 
-    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
-    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
-
-    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
-    { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
 
     { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
     { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
-
-    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
     { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
-    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
  };
 
   static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1732,252 +1869,307 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
     { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
     { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
     { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
 
-    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
     { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
-    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
-    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
+
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
     { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
 
     { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
     { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
 
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },
+
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
+    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },
+
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },
+
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
   };
 
   static const TypeConversionCostTblEntry AVXConversionTbl[] = {
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
-
-    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
-    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
-    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
-    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
-    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
-
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
-    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
-    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
-    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
-    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
-    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
-    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
-    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
-
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
-    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i8, 3 },
-    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 8 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 3 },
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i16, 3 },
-    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
-    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
-
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 5 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 5 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 6 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 6 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
-    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 9 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
-    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 6 },
-    // The generic code to compute the scalar overhead is currently broken.
-    // Workaround this limitation by estimating the scalarization overhead
-    // here. We have roughly 10 instructions per scalar element.
-    // Multiply that by the vector width.
-    // FIXME: remove that when PR19268 is fixed.
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
-    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
-
-    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
-    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
-    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
-    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
-
-    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
-    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
-    { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
-    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
-    // This node is expanded into scalarized operations but BasicTTI is overly
-    // optimistic estimating its cost. It computes 3 per element (one
-    // vector-extract, one scalar conversion and one vector-insert). The
-    // problem is that the inserts form a read-modify-write chain so latency
-    // should be factored in too. Inflating the cost per element by 1.
-    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 8*4 },
-    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
+
+    { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+    { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+    { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
+    { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },
+
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
+    { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
+    { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },
+
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },
+
+    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
+    { ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },
 
     { ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
     { ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
   };
 
   static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 2 },
-    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
-    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
-
-    { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 },
-    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
-    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 },
-    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
-    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 },
-    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
-    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
-    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
-    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
-    { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
-    { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+    { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
+    { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
 
     // These truncates end up widening elements.
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
     { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
     { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
 
-    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
-    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
-    { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
-    { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
-    { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
-    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 },
-    { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
-    { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
-
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
+    { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },
+
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
+
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
     { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
     { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
-
-    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
-    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
-
-    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
-    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
-    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },
+
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },
+
+    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
+    { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
+    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
+    { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
+    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
   };
 
   static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
-    // These are somewhat magic numbers justified by looking at the output of
-    // Intel's IACA, running some kernels and making sure when we take
-    // legalization into account the throughput will be overestimated.
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 2*10 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2*10 },
-    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 },
-
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
-    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
-
-    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
-    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
-    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
-    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
-    { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
-    { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
-
-    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
-
-    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
-    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
-
+    // These are somewhat magic numbers justified by comparing the
+    // output of llvm-mca for our various supported scheduler models
+    // and basing it off the worst case scenario.
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
+    { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
+    { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },
+
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
+    { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
+    { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },
+
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
+    { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
+    { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
+    { ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
+    { ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },
+
+    { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
     { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
-    { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
-    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
-    { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
-    { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
-    { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
-
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, - { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, - - { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, - { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, - { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 }, - { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, - { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, - { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, - { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 10 }, - { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, - { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, - { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, - { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, - { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 }, - { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 }, + { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 }, + { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 }, + { ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 }, + { ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 }, + + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 }, + { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 }, // These truncates are really widening elements. 
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD @@ -1987,113 +2179,185 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW - { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB - { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB + { ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, - { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB - { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, - { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, - { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, - { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, - { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, - { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB - { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW - { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1 }, // PSHUFD + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB + { ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW + { ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD }; - std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - if (ST->hasSSE2() && !ST->hasAVX()) { - if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, - LTDest.second, LTSrc.second)) - return AdjustCost(LTSrc.first * Entry->Cost); - } - + // Attempt to map directly to (simple) MVT types to let us match custom entries. EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); // The function getSimpleVT only handles simple value types. 
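Note: for readers new to this file, every TypeConversionCostTblEntry row above is keyed on an ISD opcode plus a destination and source MVT, and ConvertCostTableLookup is a first-match linear scan, so the subtarget checks that follow consult the newest applicable table first and fall through to older ones. A minimal standalone model of the scan, using illustrative stand-in types rather than the real LLVM definitions:

    #include <cstddef>

    // Stand-ins for the real ISD opcode and MVT enums (illustrative only).
    enum ISDOp { ZERO_EXTEND, SIGN_EXTEND, TRUNCATE, FP_TO_SINT };
    enum SimpleVT { v16i8, v8i16, v4i32, v2i64 };

    struct ConvCostEntry {
      ISDOp Op;
      SimpleVT Dst;
      SimpleVT Src;
      int Cost;
    };

    // First matching row wins, mirroring ConvertCostTableLookup's scan.
    const ConvCostEntry *convLookup(const ConvCostEntry *Tbl, size_t N,
                                    ISDOp Op, SimpleVT Dst, SimpleVT Src) {
      for (size_t I = 0; I != N; ++I)
        if (Tbl[I].Op == Op && Tbl[I].Dst == Dst && Tbl[I].Src == Src)
          return &Tbl[I];
      return nullptr; // caller falls through to the next, more generic table
    }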
- if (!SrcTy.isSimple() || !DstTy.isSimple()) - return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind)); - - MVT SimpleSrcTy = SrcTy.getSimpleVT(); - MVT SimpleDstTy = DstTy.getSimpleVT(); + if (SrcTy.isSimple() && DstTy.isSimple()) { + MVT SimpleSrcTy = SrcTy.getSimpleVT(); + MVT SimpleDstTy = DstTy.getSimpleVT(); + + if (ST->useAVX512Regs()) { + if (ST->hasBWI()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + } - if (ST->useAVX512Regs()) { if (ST->hasBWI()) - if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) + if (const auto *Entry = ConvertCostTableLookup( + AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return AdjustCost(Entry->Cost); if (ST->hasDQI()) - if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) + if (const auto *Entry = ConvertCostTableLookup( + AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return AdjustCost(Entry->Cost); if (ST->hasAVX512()) - if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) return AdjustCost(Entry->Cost); + + if (ST->hasAVX2()) { + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + } + + if (ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + SimpleDstTy, SimpleSrcTy)) + return AdjustCost(Entry->Cost); + } + } + + // Fall back to legalized types. 
+ std::pair<InstructionCost, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<InstructionCost, MVT> LTDest = + TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->useAVX512Regs()) { + if (ST->hasBWI()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup( + AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); } if (ST->hasBWI()) if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); if (ST->hasDQI()) if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); if (ST->hasAVX512()) if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); - if (ST->hasAVX2()) { + if (ST->hasAVX2()) if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); - } + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); - if (ST->hasAVX()) { + if (ST->hasAVX()) if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); - } + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); - if (ST->hasSSE41()) { + if (ST->hasSSE41()) if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); - } + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); - if (ST->hasSSE2()) { + if (ST->hasSSE2()) if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, - SimpleDstTy, SimpleSrcTy)) - return AdjustCost(Entry->Cost); + LTDest.second, LTSrc.second)) + return AdjustCost(std::max(LTSrc.first, LTDest.first) * Entry->Cost); + + // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for + // sitofp. + if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && + 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { + Type *ExtSrc = Src->getWithNewBitWidth(32); + unsigned ExtOpc = + (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; + + // For scalar loads the extend would be free. + InstructionCost ExtCost = 0; + if (!(Src->isIntegerTy() && I && isa<LoadInst>(I->getOperand(0)))) + ExtCost = getCastInstrCost(ExtOpc, ExtSrc, Src, CCH, CostKind); + + return ExtCost + getCastInstrCost(Instruction::SIToFP, Dst, ExtSrc, + TTI::CastContextHint::None, CostKind); + } + + // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi + // i32. 
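Note: both small-integer fallbacks here (the sitofp/uitofp one just added above, and the fptosi/fptoui one whose code follows) rest on the same hardware fact: CVTSI2SS/CVTSI2SD and CVTTSS2SI/CVTTSD2SI only speak 32- and 64-bit integers, so a narrow conversion is really the i32 conversion bracketed by an extend or truncate. A scalar C++ analogue of what is being priced (illustrative helper names, not the cost code itself):

    #include <cstdint>

    // sitofp i16 -> float is costed as (sext to i32) + (sitofp i32 -> float).
    float i16ToFloat(int16_t X) {
      int32_t Wide = X;   // sext i16 -> i32; free when X comes straight from a load
      return float(Wide); // cvtsi2ss
    }

    // fptosi float -> i8 is costed as (fptosi -> i32) + (trunc i32 -> i8).
    int8_t floatToI8(float F) {
      int32_t Wide = int32_t(F); // cvttss2si
      return int8_t(Wide);       // trunc i32 -> i8
    }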
+ if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && + 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { + Type *TruncDst = Dst->getWithNewBitWidth(32); + return getCastInstrCost(Instruction::FPToSI, TruncDst, Src, CCH, CostKind) + + getCastInstrCost(Instruction::Trunc, Dst, TruncDst, + TTI::CastContextHint::None, CostKind); } return AdjustCost( BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); } -int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, - CmpInst::Predicate VecPred, - TTI::TargetCostKind CostKind, - const Instruction *I) { +InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy, + CmpInst::Predicate VecPred, + TTI::TargetCostKind CostKind, + const Instruction *I) { // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -2279,8 +2543,9 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } -int X86TTIImpl::getTypeBasedIntrinsicInstrCost( - const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { +InstructionCost +X86TTIImpl::getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { // Costs should match the codegen from: // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll @@ -2312,6 +2577,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::BITREVERSE, MVT::v16i32, 5 }, { ISD::BITREVERSE, MVT::v32i16, 5 }, { ISD::BITREVERSE, MVT::v64i8, 5 }, + { ISD::BSWAP, MVT::v8i64, 1 }, + { ISD::BSWAP, MVT::v16i32, 1 }, + { ISD::BSWAP, MVT::v32i16, 1 }, { ISD::CTLZ, MVT::v8i64, 23 }, { ISD::CTLZ, MVT::v16i32, 22 }, { ISD::CTLZ, MVT::v32i16, 18 }, @@ -2352,6 +2620,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::BITREVERSE, MVT::v16i32, 24 }, { ISD::BITREVERSE, MVT::v32i16, 10 }, { ISD::BITREVERSE, MVT::v64i8, 10 }, + { ISD::BSWAP, MVT::v8i64, 4 }, + { ISD::BSWAP, MVT::v16i32, 4 }, + { ISD::BSWAP, MVT::v32i16, 4 }, { ISD::CTLZ, MVT::v8i64, 29 }, { ISD::CTLZ, MVT::v16i32, 35 }, { ISD::CTLZ, MVT::v32i16, 28 }, @@ -2670,6 +2941,7 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( static const CostTblEntry X64CostTbl[] = { // 64-bit targets { ISD::ABS, MVT::i64, 2 }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, 14 }, + { ISD::BSWAP, MVT::i64, 1 }, { ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH { ISD::CTPOP, MVT::i64, 10 }, @@ -2683,6 +2955,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( { ISD::BITREVERSE, MVT::i32, 14 }, { ISD::BITREVERSE, MVT::i16, 14 }, { ISD::BITREVERSE, MVT::i8, 11 }, + { ISD::BSWAP, MVT::i32, 1 }, + { ISD::BSWAP, MVT::i16, 1 }, // ROL { ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV { ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV @@ -2782,7 +3056,7 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, OpTy); MVT MTy = LT.second; // Attempt to lookup cost. 
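Note on an addition in the hunks that follow: bswap becomes TCC_Free when the subtarget has fast MOVBE and the swapped value either has a single use that is a store or is a single-use load, because the swap then folds into the memory access itself. An illustration of the pattern being recognized (hypothetical helper; __builtin_bswap32 is the GCC/Clang builtin, and with -mmovbe the load+swap pair can lower to one movbe):

    #include <cstdint>
    #include <cstring>

    // Reading a big-endian u32: a plain load feeding a byte swap. With fast
    // MOVBE the pair is a single instruction, hence the zero cost.
    uint32_t loadBE32(const void *P) {
      uint32_t V;
      std::memcpy(&V, P, sizeof(V)); // the single-use load
      return __builtin_bswap32(V);   // folds into the load as movbe
    }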
@@ -2802,7 +3076,8 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( return LT.first * Cost; } - auto adjustTableCost = [](const CostTblEntry &Entry, int LegalizationCost, + auto adjustTableCost = [](const CostTblEntry &Entry, + InstructionCost LegalizationCost, FastMathFlags FMF) { // If there are no NANs to deal with, then these are reduced to a // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we @@ -2893,6 +3168,17 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( return adjustTableCost(*Entry, LT.first, ICA.getFlags()); } + if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) { + if (const Instruction *II = ICA.getInst()) { + if (II->hasOneUse() && isa<StoreInst>(II->user_back())) + return TTI::TCC_Free; + if (auto *LI = dyn_cast<LoadInst>(II->getOperand(0))) { + if (LI->hasOneUse()) + return TTI::TCC_Free; + } + } + } + // TODO - add BMI (TZCNT) scalar handling if (ST->is64Bit()) @@ -2906,8 +3192,9 @@ int X86TTIImpl::getTypeBasedIntrinsicInstrCost( return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, - TTI::TargetCostKind CostKind) { +InstructionCost +X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, + TTI::TargetCostKind CostKind) { if (ICA.isTypeBasedOnly()) return getTypeBasedIntrinsicInstrCost(ICA, CostKind); @@ -2983,7 +3270,8 @@ int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (ISD != ISD::DELETED_NODE) { // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); + std::pair<InstructionCost, MVT> LT = + TLI->getTypeLegalizationCost(DL, RetTy); MVT MTy = LT.second; // Attempt to lookup cost. @@ -3006,7 +3294,8 @@ int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { +InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { static const CostTblEntry SLMCostTbl[] = { { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, @@ -3018,10 +3307,40 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { Type *ScalarType = Val->getScalarType(); int RegisterFileMoveCost = 0; + // Non-immediate extraction/insertion can be handled as a sequence of + // aliased loads+stores via the stack. + if (Index == -1U && (Opcode == Instruction::ExtractElement || + Opcode == Instruction::InsertElement)) { + // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns: + // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. + + // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling. + assert(isa<FixedVectorType>(Val) && "Fixed vector type expected"); + Align VecAlign = DL.getPrefTypeAlign(Val); + Align SclAlign = DL.getPrefTypeAlign(ScalarType); + + // Extract - store vector to stack, load scalar. + if (Opcode == Instruction::ExtractElement) { + return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, + TTI::TargetCostKind::TCK_RecipThroughput) + + getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, + TTI::TargetCostKind::TCK_RecipThroughput); + } + // Insert - store vector to stack, store scalar, load vector. 
+ if (Opcode == Instruction::InsertElement) { + return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, + TTI::TargetCostKind::TCK_RecipThroughput) + + getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, + TTI::TargetCostKind::TCK_RecipThroughput) + + getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, + TTI::TargetCostKind::TCK_RecipThroughput); + } + } + if (Index != -1U && (Opcode == Instruction::ExtractElement || Opcode == Instruction::InsertElement)) { // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -3079,13 +3398,14 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { // subvector move(s). // If the vector type is already less than 128-bits then don't reduce it. // TODO: Under what circumstances should we shuffle using the full width? - int ShuffleCost = 1; + InstructionCost ShuffleCost = 1; if (Opcode == Instruction::InsertElement) { auto *SubTy = cast<VectorType>(Val); EVT VT = TLI->getValueType(DL, Val); if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) SubTy = FixedVectorType::get(ScalarType, SubNumElts); - ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy); + ShuffleCost = + getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); } int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; @@ -3099,15 +3419,16 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } -unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, - const APInt &DemandedElts, - bool Insert, bool Extract) { - unsigned Cost = 0; +InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, + bool Insert, + bool Extract) { + InstructionCost Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. if (Insert) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); MVT MScalarTy = LT.second.getScalarType(); if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || @@ -3131,8 +3452,10 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, // Case#2: inserting into 5th index needs extracti128 + vpinsrd + // inserti128. // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. 
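A side note on the new variable-index path at the top of getVectorInstrCost: pricing it as stack traffic mirrors how such element accesses are actually lowered. In intrinsics form (an illustrative analogue, not the generated code):

    #include <immintrin.h>

    // Extract with a run-time lane index: spill the vector, reload one lane.
    float extractLane(__m128 V, unsigned Idx) {
      alignas(16) float Buf[4];
      _mm_store_ps(Buf, V); // vector store to a stack slot
      return Buf[Idx & 3];  // scalar load of the selected lane
    }

    // Insert with a run-time lane index: spill, overwrite one lane, reload.
    __m128 insertLane(__m128 V, float X, unsigned Idx) {
      alignas(16) float Buf[4];
      _mm_store_ps(Buf, V);    // vector store
      Buf[Idx & 3] = X;        // scalar store
      return _mm_load_ps(Buf); // vector reload
    }

This is exactly the store+load (extract) and store+store+load (insert) sums charged above.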
- unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * LT.first; - unsigned NumElts = LT.second.getVectorNumElements() * LT.first; + const int CostValue = *LT.first.getValue(); + assert(CostValue >= 0 && "Negative cost!"); + unsigned Num128Lanes = LT.second.getSizeInBits() / 128 * CostValue; + unsigned NumElts = LT.second.getVectorNumElements() * CostValue; APInt WidenedDemandedElts = DemandedElts.zextOrSelf(NumElts); unsigned Scale = NumElts / Num128Lanes; // We iterate each 128-lane, and check if we need a @@ -3182,10 +3505,11 @@ unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, return Cost; } -int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - MaybeAlign Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind, - const Instruction *I) { +InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + MaybeAlign Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I) { // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) { if (auto *SI = dyn_cast_or_null<StoreInst>(I)) { @@ -3199,57 +3523,146 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return TTI::TCC_Basic; } - // Handle non-power-of-two vectors such as <3 x float> - if (auto *VTy = dyn_cast<FixedVectorType>(Src)) { - unsigned NumElem = VTy->getNumElements(); - - // Handle a few common cases: - // <3 x float> - if (NumElem == 3 && VTy->getScalarSizeInBits() == 32) - // Cost = 64 bit store + extract + 32 bit store. - return 3; - - // <3 x double> - if (NumElem == 3 && VTy->getScalarSizeInBits() == 64) - // Cost = 128 bit store + unpack + 64 bit store. - return 3; - - // Assume that all other non-power-of-two numbers are scalarized. - if (!isPowerOf2_32(NumElem)) { - APInt DemandedElts = APInt::getAllOnesValue(NumElem); - int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, - AddressSpace, CostKind); - int SplitCost = getScalarizationOverhead(VTy, DemandedElts, - Opcode == Instruction::Load, - Opcode == Instruction::Store); - return NumElem * Cost + SplitCost; - } - } - + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); // Type legalization can't handle structs - if (TLI->getValueType(DL, Src, true) == MVT::Other) + if (TLI->getValueType(DL, Src, true) == MVT::Other) return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && - "Invalid Opcode"); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + + auto *VTy = dyn_cast<FixedVectorType>(Src); + + // Handle the simple case of non-vectors. + // NOTE: this assumes that legalization never creates vector from scalars! + if (!VTy || !LT.second.isVector()) + // Each load/store unit costs 1. + return LT.first * 1; + + bool IsLoad = Opcode == Instruction::Load; - // Each load/store unit costs 1. - int Cost = LT.first * 1; + Type *EltTy = VTy->getElementType(); - // This isn't exactly right. We're using slow unaligned 32-byte accesses as a - // proxy for a double-pumped AVX memory interface such as on Sandybridge. - if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow()) - Cost *= 2; + const int EltTyBits = DL.getTypeSizeInBits(EltTy); + + InstructionCost Cost = 0; + + // Source of truth: how many elements were there in the original IR vector? 
+ const unsigned SrcNumElt = VTy->getNumElements();
+
+ // How far have we gotten?
+ int NumEltRemaining = SrcNumElt;
+ // Note that we intentionally capture by-reference, NumEltRemaining changes.
+ auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+
+ const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
+
+ // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+ const unsigned XMMBits = 128;
+ if (XMMBits % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ const int NumEltPerXMM = XMMBits / EltTyBits;
+
+ auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+
+ for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
+ NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
+ // How many elements would a single op deal with at once?
+ if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
+ // Vector size must be a multiple of the element size. I.e. no padding.
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
+
+ assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+ assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
+ (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+ "Unless we haven't halved the op size yet, "
+ "we have less than two op-sized units of work left.");
+
+ auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+ ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+ : XMMVecTy;
+
+ assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+ "After halving sizes, the vector elt count is no longer a multiple "
+ "of number of elements per operation?");
+ auto *CoalescedVecTy =
+ CurrNumEltPerOp == 1
+ ? CurrVecTy
+ : FixedVectorType::get(
+ IntegerType::get(Src->getContext(),
+ EltTyBits * CurrNumEltPerOp),
+ CurrVecTy->getNumElements() / CurrNumEltPerOp);
+ assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+ DL.getTypeSizeInBits(CurrVecTy) &&
+ "coalescing elements doesn't change vector width.");
+
+ while (NumEltRemaining > 0) {
+ assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+
+ // Can we use this vector size, as per the remaining element count?
+ // Iff the vector is naturally aligned, we can do a wide load regardless.
+ if (NumEltRemaining < CurrNumEltPerOp &&
+ (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
+ CurrOpSizeBytes != 1)
+ break; // Try smaller vector size.
+
+ bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+
+ // If we have fully processed the previous reg, we need to replenish it.
+ if (SubVecEltsLeft == 0) {
+ SubVecEltsLeft += CurrVecTy->getNumElements();
+ // And that's free only for the 0'th subvector of a legalized vector.
+ if (!Is0thSubVec)
+ Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+ : TTI::ShuffleKind::SK_ExtractSubvector,
+ VTy, None, NumEltDone(), CurrVecTy);
+ }
+
+ // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+ // for smaller widths (32/16/8) we have to insert/extract them separately.
+ // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+ // but let's pretend that it is also true for 16/8 bit wide ops...)
+ if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) { + int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM; + assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && ""); + int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp; + APInt DemandedElts = + APInt::getBitsSet(CoalescedVecTy->getNumElements(), + CoalescedVecEltIdx, CoalescedVecEltIdx + 1); + assert(DemandedElts.countPopulation() == 1 && "Inserting single value"); + Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad, + !IsLoad); + } + + // This isn't exactly right. We're using slow unaligned 32-byte accesses + // as a proxy for a double-pumped AVX memory interface such as on + // Sandybridge. + if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow()) + Cost += 2; + else + Cost += 1; + + SubVecEltsLeft -= CurrNumEltPerOp; + NumEltRemaining -= CurrNumEltPerOp; + Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes); + } + } + + assert(NumEltRemaining <= 0 && "Should have processed all the elements."); return Cost; } -int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - Align Alignment, unsigned AddressSpace, - TTI::TargetCostKind CostKind) { +InstructionCost +X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind) { bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -3262,40 +3675,39 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, auto *MaskTy = FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || - (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) || - !isPowerOf2_32(NumElem)) { + (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { // Scalarization APInt DemandedElts = APInt::getAllOnesValue(NumElem); - int MaskSplitCost = + InstructionCost MaskSplitCost = getScalarizationOverhead(MaskTy, DemandedElts, false, true); - int ScalarCompareCost = getCmpSelInstrCost( + InstructionCost ScalarCompareCost = getCmpSelInstrCost( Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr, CmpInst::BAD_ICMP_PREDICATE, CostKind); - int BranchCost = getCFInstrCost(Instruction::Br, CostKind); - int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - int ValueSplitCost = + InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); + InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + InstructionCost ValueSplitCost = getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore); - int MemopCost = + InstructionCost MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace, CostKind); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } // Legalize the type. - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); auto VT = TLI->getValueType(DL, SrcVTy); - int Cost = 0; + InstructionCost Cost = 0; if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) - // Promotion requires expand/truncate for data and a shuffle for mask. - Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + - getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); + // Promotion requires extend/truncate for data and a shuffle for mask. 
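Stepping back to the rewritten getMemoryOpCost: the loop above greedily covers the original IR vector with the widest legal memory op and halves the op size whenever the remainder no longer fills one, which is how non-power-of-two vectors such as <7 x i32> are now priced instead of being fully scalarized. A standalone model of just the op-counting part (illustrative; the real loop additionally charges subvector insert/extract shuffles and doubles slow unaligned 32-byte ops):

    // Count memory ops covering NumElt elements of EltBytes each, halving the
    // op width as needed. Assumes powers of two and EltBytes <= MaxOpBytes.
    int countMemOps(int NumElt, int EltBytes, int MaxOpBytes) {
      int Ops = 0;
      for (int OpBytes = MaxOpBytes; NumElt > 0 && OpBytes >= EltBytes;
           OpBytes /= 2) {
        int EltPerOp = OpBytes / EltBytes;
        while (NumElt >= EltPerOp) { // emit ops at this width while they fit
          ++Ops;
          NumElt -= EltPerOp;
        }
      }
      return Ops;
    }

For example, countMemOps(7, 4, 32) returns 3: a <7 x i32> store becomes one 16-byte, one 8-byte, and one 4-byte access.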
+ Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) + + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr); - else if (LT.second.getVectorNumElements() > NumElem) { + else if (LT.first * LT.second.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), LT.second.getVectorNumElements()); // Expanding requires fill mask with zeroes - Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy); } // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. @@ -3306,8 +3718,9 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost + LT.first; } -int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, - const SCEV *Ptr) { +InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, + ScalarEvolution *SE, + const SCEV *Ptr) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -3331,12 +3744,12 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return BaseT::getAddressComputationCost(Ty, SE, Ptr); } -int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwise, - TTI::TargetCostKind CostKind) { - // Just use the default implementation for pair reductions. - if (IsPairwise) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind); +InstructionCost +X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, + Optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind) { + if (TTI::requiresOrderedReduction(FMF)) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind); // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. @@ -3348,6 +3761,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, static const CostTblEntry SSE2CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, + { ISD::FADD, MVT::v2f32, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32 @@ -3394,13 +3808,23 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, return Entry->Cost; } - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; auto *ValVTy = cast<FixedVectorType>(ValTy); - unsigned ArithmeticCost = 0; + // Special case: vXi8 mul reductions are performed as vXi16. + if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) { + auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16); + auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements()); + return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy, + TargetTransformInfo::CastContextHint::None, + CostKind) + + getArithmeticReductionCost(Opcode, WideVecTy, FMF, CostKind); + } + + InstructionCost ArithmeticCost = 0; if (LT.first != 1 && MTy.isVector() && MTy.getVectorNumElements() < ValVTy->getNumElements()) { // Type needs to be split. We need LT.first - 1 arithmetic ops. 
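The vXi8 multiply-reduction special case above prices the whole reduction as a zero-extension to vXi16 plus a vXi16 reduction, since x86 has no byte multiply (PMULLW is the narrowest). A scalar picture of that widening (illustrative only; modular arithmetic makes the single final truncation exact):

    #include <cstdint>

    // Product of N bytes, computed the way the costed lowering does: widen to
    // 16 bits, multiply there, truncate once at the very end.
    uint8_t mulReduceBytes(const uint8_t *V, int N) {
      uint16_t Acc = 1; // accumulator at the widened element width
      for (int I = 0; I < N; ++I)
        Acc = uint16_t(Acc * V[I]); // 16-bit multiplies (PMULLW analogue)
      return uint8_t(Acc);          // single trunc back to i8
    }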
@@ -3470,7 +3894,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // Handle bool allof/anyof patterns. if (ValVTy->getElementType()->isIntegerTy(1)) { - unsigned ArithmeticCost = 0; + InstructionCost ArithmeticCost = 0; if (LT.first != 1 && MTy.isVector() && MTy.getVectorNumElements() < ValVTy->getNumElements()) { // Type needs to be split. We need LT.first - 1 arithmetic ops. @@ -3493,8 +3917,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); } unsigned NumVecElts = ValVTy->getNumElements(); @@ -3503,10 +3926,9 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // Special case power of 2 reductions where the scalar type isn't changed // by type legalization. if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ValVTy, FMF, CostKind); - unsigned ReductionCost = 0; + InstructionCost ReductionCost = 0; auto *Ty = ValVTy; if (LT.first != 1 && MTy.isVector() && @@ -3529,7 +3951,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, if (Size > 128) { auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); ReductionCost += - getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); Ty = SubTy; } else if (Size == 128) { // Reducing from 128 bits is a permute of v2f64/v2i64. @@ -3541,7 +3963,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, ShufTy = FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); } else if (Size == 64) { // Reducing from 64 bits is a shuffle of v4f32/v4i32. FixedVectorType *ShufTy; @@ -3552,7 +3974,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, ShufTy = FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); } else { // Reducing from smaller size is a shift by immediate. auto *ShiftTy = FixedVectorType::get( @@ -3572,8 +3994,9 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } -int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { - std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); +InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, + bool IsUnsigned) { + std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); MVT MTy = LT.second; @@ -3691,21 +4114,19 @@ int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; // Otherwise fall back to cmp+select. 
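When none of the native min/max tables matched, the fallback below prices the operation as an explicit compare plus select, i.e. the vector form of this scalar pattern (illustrative analogue):

    #include <cstdint>

    // A min without a dedicated instruction: a compare producing a mask,
    // then a blend between the two operands (PCMPGT + BLENDV in vector form).
    int32_t smin(int32_t A, int32_t B) {
      bool M = A < B;   // compare
      return M ? A : B; // select
    }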
- return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
- CostKind) +
- getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ InstructionCost Result =
+ getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE,
+ CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ return Result;
}

-int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
- bool IsPairwise, bool IsUnsigned,
- TTI::TargetCostKind CostKind) {
- // Just use the default implementation for pair reductions.
- if (IsPairwise)
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
-
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+InstructionCost
+X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

MVT MTy = LT.second;

@@ -3785,7 +4206,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
unsigned NumVecElts = ValVTy->getNumElements();

auto *Ty = ValVTy;
- unsigned MinMaxCost = 0;
+ InstructionCost MinMaxCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {
// Type needs to be split. We need LT.first - 1 operations.
@@ -3820,8 +4241,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
// by type legalization.
if (!isPowerOf2_32(ValVTy->getNumElements()) ||
ScalarSize != MTy.getScalarSizeInBits())
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
- CostKind);
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsUnsigned, CostKind);

// Now handle reduction with the legal type, taking into account size changes
// at each level.
@@ -3833,7 +4253,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
if (Size > 128) {
auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
MinMaxCost +=
- getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
Ty = SubTy;
} else if (Size == 128) {
// Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3844,7 +4264,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
else
ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else if (Size == 64) {
// Reducing from 64 bits is a shuffle of v4f32/v4i32.
FixedVectorType *ShufTy;
@@ -3853,7 +4273,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
else
ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
MinMaxCost +=
- getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
} else {
// Reducing from smaller size is a shift by immediate.
auto *ShiftTy = FixedVectorType::get(
@@ -3878,7 +4298,7 @@ int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
-int X86TTIImpl::getIntImmCost(int64_t Val) { +InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) return TTI::TCC_Free; @@ -3888,8 +4308,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { +InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -3913,20 +4333,20 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - int Cost = 0; + InstructionCost Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialize the constant. - return std::max(1, Cost); + return std::max<InstructionCost>(1, Cost); } -int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind, - Instruction *Inst) { +InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind, + Instruction *Inst) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -4013,7 +4433,7 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, if (Idx == ImmIdx) { int NumConstants = divideCeil(BitSize, 64); - int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); + InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); return (Cost <= NumConstants * TTI::TCC_Basic) ? static_cast<int>(TTI::TCC_Free) : Cost; @@ -4022,9 +4442,9 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); } -int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty, - TTI::TargetCostKind CostKind) { +InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty, + TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -4058,12 +4478,13 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); } -unsigned -X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) { +InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, + TTI::TargetCostKind CostKind, + const Instruction *I) { if (CostKind != TTI::TCK_RecipThroughput) return Opcode == Instruction::PHI ? 0 : 1; // Branches are assumed to be predicted. - return CostKind == TTI::TCK_RecipThroughput ? 0 : 1; + return 0; } int X86TTIImpl::getGatherOverhead() const { @@ -4088,8 +4509,9 @@ int X86TTIImpl::getScatterOverhead() const { // Return an average cost of Gather / Scatter instruction, maybe improved later. // FIXME: Add TargetCostKind support. 
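Looking ahead to getGSVectorCost below: it charges one hardware gather/scatter per legalized part, and the split factor is the larger of the data vector's and the index vector's split counts, since whichever legalizes worse dictates how many ops must be issued. What each issued gather does, written out per lane (illustrative scalar semantics):

    #include <cstdint>

    // Per-lane semantics of one vpgatherdd-class op over its slice of lanes.
    void gather32(const int32_t *Base, const int32_t *Idx, int32_t *Out,
                  int VF) {
      for (int I = 0; I < VF; ++I)
        Out[I] = Base[Idx[I]]; // one gathered lane
    }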
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, - Align Alignment, unsigned AddressSpace) { +InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, + const Value *Ptr, Align Alignment, + unsigned AddressSpace) { assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); @@ -4131,9 +4553,12 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, auto *IndexVTy = FixedVectorType::get( IntegerType::get(SrcVTy->getContext(), IndexSize), VF); - std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); - std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); - int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + std::pair<InstructionCost, MVT> IdxsLT = + TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<InstructionCost, MVT> SrcLT = + TLI->getTypeLegalizationCost(DL, SrcVTy); + InstructionCost::CostType SplitFactor = + *std::max(IdxsLT.first, SrcLT.first).getValue(); if (SplitFactor > 1) { // Handle splitting of vector of pointers auto *SplitSrcTy = @@ -4161,32 +4586,32 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr, /// AddressSpace - pointer[s] address space. /// /// FIXME: Add TargetCostKind support. -int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, - bool VariableMask, Align Alignment, - unsigned AddressSpace) { +InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, Align Alignment, + unsigned AddressSpace) { unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); APInt DemandedElts = APInt::getAllOnesValue(VF); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - int MaskUnpackCost = 0; + InstructionCost MaskUnpackCost = 0; if (VariableMask) { auto *MaskTy = FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF); MaskUnpackCost = getScalarizationOverhead(MaskTy, DemandedElts, false, true); - int ScalarCompareCost = getCmpSelInstrCost( + InstructionCost ScalarCompareCost = getCmpSelInstrCost( Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr, CmpInst::BAD_ICMP_PREDICATE, CostKind); - int BranchCost = getCFInstrCost(Instruction::Br, CostKind); + InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind); MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); } // The cost of the scalar loads/stores. 
- int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), - MaybeAlign(Alignment), AddressSpace, - CostKind); + InstructionCost MemoryOpCost = + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + MaybeAlign(Alignment), AddressSpace, CostKind); - int InsertExtractCost = 0; + InstructionCost InsertExtractCost = 0; if (Opcode == Instruction::Load) for (unsigned i = 0; i < VF; ++i) // Add the cost of inserting each scalar load into the vector @@ -4202,11 +4627,10 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, } /// Calculate the cost of Gather / Scatter operation -int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, - const Value *Ptr, bool VariableMask, - Align Alignment, - TTI::TargetCostKind CostKind, - const Instruction *I = nullptr) { +InstructionCost X86TTIImpl::getGatherScatterOpCost( + unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr) { if (CostKind != TTI::TCK_RecipThroughput) { if ((Opcode == Instruction::Load && isLegalMaskedGather(SrcVTy, Align(Alignment))) || @@ -4218,7 +4642,6 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, } assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); - unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements(); PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); if (!PtrTy && Ptr->getType()->isVectorTy()) PtrTy = dyn_cast<PointerType>( @@ -4226,22 +4649,10 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, assert(PtrTy && "Unexpected type for Ptr argument"); unsigned AddressSpace = PtrTy->getAddressSpace(); - bool Scalarize = false; if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy, Align(Alignment))) || (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy, Align(Alignment)))) - Scalarize = true; - // Gather / Scatter for vector 2 is not profitable on KNL / SKX - // Vector-4 of gather/scatter instruction does not exist on KNL. - // We can extend it to 8 elements, but zeroing upper bits of - // the mask vector will add more instructions. Right now we give the scalar - // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction - // is better in the VariableMask case. - if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) - Scalarize = true; - - if (Scalarize) return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); @@ -4377,6 +4788,14 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { unsigned NumElts = DataVTy->getNumElements(); if (NumElts == 1) return false; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter + // instruction is better in the VariableMask case. + if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))) + return false; } Type *ScalarTy = DataTy->getScalarType(); if (ScalarTy->isPointerTy()) @@ -4493,7 +4912,7 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() { // computing the cost using a generic formula as a function of generic // shuffles. We therefore use a lookup table instead, filled according to // the instruction sequences that codegen currently generates. 
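For reference, the scalarized fallback in getGSScalarCost above composes its estimate from per-lane pieces; restated in one place (illustrative parameter names, no new logic):

    // Shape of the scalarized gather/scatter estimate for VF lanes; every
    // parameter is a per-item cost computed by the surrounding code.
    int gsScalarCost(int VF, bool VariableMask, int MaskExtract, int Cmp,
                     int Br, int MemOp, int InsExt) {
      int Cost = 0;
      if (VariableMask) // test each mask bit with a compare and a branch
        Cost += MaskExtract + VF * (Br + Cmp);
      Cost += VF * MemOp;  // one scalar load/store per lane
      Cost += VF * InsExt; // rebuild (or take apart) the vector
      return Cost;
    }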
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2( +InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2( unsigned Opcode, FixedVectorType *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { @@ -4507,8 +4926,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2( // TODO: Support also strided loads (interleaved-groups with gaps). if (Indices.size() && Indices.size() != Factor) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, - CostKind); + Alignment, AddressSpace, CostKind); // VecTy for interleave memop is <VF*Factor x Elt>. // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -4520,86 +4938,78 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2( // (see MachineValueType.h::getVectorVT()). if (!LegalVT.isVector()) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, - CostKind); + Alignment, AddressSpace, CostKind); unsigned VF = VecTy->getNumElements() / Factor; Type *ScalarTy = VecTy->getElementType(); + // Deduplicate entries, model floats/pointers as appropriately-sized integers. + if (!ScalarTy->isIntegerTy()) + ScalarTy = + Type::getIntNTy(ScalarTy->getContext(), DL.getTypeSizeInBits(ScalarTy)); - // Calculate the number of memory operations (NumOfMemOps), required - // for load/store the VecTy. - unsigned VecTySize = DL.getTypeStoreSize(VecTy); - unsigned LegalVTSize = LegalVT.getStoreSize(); - unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; - - // Get the cost of one memory operation. - auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(), - LegalVT.getVectorNumElements()); - unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, - MaybeAlign(Alignment), AddressSpace, - CostKind); + // Get the cost of all the memory operations. + InstructionCost MemOpCosts = getMemoryOpCost( + Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, CostKind); auto *VT = FixedVectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); if (!ETy.isSimple()) return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, - CostKind); + Alignment, AddressSpace, CostKind); // TODO: Complete for other data-types and strides. - // Each combination of Stride, ElementTy and VF results in a different + // Each combination of Stride, element bit width and VF results in a different // sequence; The cost tables are therefore accessed with: - // Factor (stride) and VectorType=VFxElemType. + // Factor (stride) and VectorType=VFxiN. // The Cost accounts only for the shuffle sequence; // The cost of the loads/stores is accounted for separately. 
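To decode a row such as {3, MVT::v8i8, 9} in the load table below: it prices the shuffle sequence that splits one contiguous 24-byte load into three stride-3 streams of 8 bytes each, the vectorized form of this scalar loop (illustrative):

    #include <cstdint>

    // Factor-3 deinterleave: one contiguous block split into three per-field
    // streams (array-of-structs -> three separate arrays).
    void deinterleave3(const uint8_t *Src, uint8_t *A, uint8_t *B, uint8_t *C,
                       int VF) {
      for (int I = 0; I < VF; ++I) {
        A[I] = Src[3 * I + 0];
        B[I] = Src[3 * I + 1];
        C[I] = Src[3 * I + 2];
      }
    }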
// static const CostTblEntry AVX2InterleavedLoadTbl[] = { - { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64 - { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64 - - { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 - { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 - { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 - { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 - { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 - { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32 - - { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 - { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 - { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 - { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 - { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 - - { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32 + {2, MVT::v4i64, 6}, // (load 8i64 and) deinterleave into 2 x 4i64 + + {3, MVT::v2i8, 10}, // (load 6i8 and) deinterleave into 3 x 2i8 + {3, MVT::v4i8, 4}, // (load 12i8 and) deinterleave into 3 x 4i8 + {3, MVT::v8i8, 9}, // (load 24i8 and) deinterleave into 3 x 8i8 + {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 + {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8 + + {3, MVT::v8i32, 17}, // (load 24i32 and) deinterleave into 3 x 8i32 + + {4, MVT::v2i8, 12}, // (load 8i8 and) deinterleave into 4 x 2i8 + {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 + {4, MVT::v8i8, 20}, // (load 32i8 and) deinterleave into 4 x 8i8 + {4, MVT::v16i8, 39}, // (load 64i8 and) deinterleave into 4 x 16i8 + {4, MVT::v32i8, 80}, // (load 128i8 and) deinterleave into 4 x 32i8 + + {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 }; static const CostTblEntry AVX2InterleavedStoreTbl[] = { - { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store) - { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store) - - { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) - { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) - { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) - { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) - { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) - - { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) - { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) - { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) - { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) - { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) + {2, MVT::v4i64, 6}, // interleave 2 x 4i64 into 8i64 (and store) + + {3, MVT::v2i8, 7}, // interleave 3 x 2i8 into 6i8 (and store) + {3, MVT::v4i8, 8}, // interleave 3 x 4i8 into 12i8 (and store) + {3, MVT::v8i8, 11}, // interleave 3 x 8i8 into 24i8 (and store) + {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) + {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) + + {4, MVT::v2i8, 12}, // interleave 4 x 2i8 into 8i8 (and store) + {4, MVT::v4i8, 9}, // interleave 4 x 4i8 into 16i8 (and store) + {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) + {4, MVT::v16i8, 10}, // interleave 4 x 16i8 into 64i8 (and store) + {4, MVT::v32i8, 12} // interleave 4 x 32i8 into 128i8 (and store) }; if (Opcode == 
@@ -4610,7 +5020,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
 // \p Indices contains indices for strided load.
 // \p Factor - the factor of interleaving.
 // AVX-512 provides 3-src shuffles that significantly reduces the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
     ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
     TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
@@ -4634,9 +5044,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   // Get the cost of one memory operation.
   auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
                                              LegalVT.getVectorNumElements());
-  unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
-                                       MaybeAlign(Alignment), AddressSpace,
-                                       CostKind);
+  InstructionCost MemOpCost = getMemoryOpCost(
+      Opcode, SingleMemOpTy, MaybeAlign(Alignment), AddressSpace, CostKind);

   unsigned VF = VecTy->getNumElements() / Factor;
   MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
@@ -4665,14 +5074,14 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     TTI::ShuffleKind ShuffleKind =
         (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

-    unsigned ShuffleCost =
-        getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+    InstructionCost ShuffleCost =
+        getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);

     unsigned NumOfLoadsInInterleaveGrp =
         Indices.size() ? Indices.size() : Factor;
     auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
                                           VecTy->getNumElements() / Factor);
-    unsigned NumOfResults =
+    InstructionCost NumOfResults =
         getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
         NumOfLoadsInInterleaveGrp;
@@ -4688,12 +5097,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
     // The SK_MergeTwoSrc shuffle clobbers one of src operands.
     // When we have more than one destination, we need additional instructions
     // to keep sources.
-    unsigned NumOfMoves = 0;
+    InstructionCost NumOfMoves = 0;
     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

-    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
-               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+                           NumOfUnfoldedLoads * MemOpCost + NumOfMoves;

     return Cost;
   }
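The load path above combines three terms: shuffles to assemble each result, the loads that cannot fold into shuffles, and copies needed because two-source permutes clobber an operand. A standalone rendering of that arithmetic follows; the operand values are invented for illustration, whereas in the patch they come from type legalization and the getShuffleCost/getMemoryOpCost hooks.

#include <cstdio>

int main() {
  // Invented example values -- see the lead-in.
  unsigned NumOfResults = 2;           // legalized result regs * loads in group
  unsigned NumOfShufflesPerResult = 3;
  unsigned ShuffleCost = 1;
  unsigned NumOfUnfoldedLoads = 2;     // loads that can't fold into shuffles
  unsigned MemOpCost = 1;
  bool TwoSrcPermute = true;           // NumOfMemOps > 1 => SK_PermuteTwoSrc

  // Two-source permutes clobber one source, so with multiple results half
  // of the shuffles need an extra register move to preserve their inputs.
  unsigned NumOfMoves = 0;
  if (NumOfResults > 1 && TwoSrcPermute)
    NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

  unsigned Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                  NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
  std::printf("modeled load cost = %u\n", Cost); // 6 + 2 + 3 = 11
  return 0;
}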
@@ -4721,19 +5130,20 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
   // There is no strided stores meanwhile. And store can't be folded in
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
-  unsigned ShuffleCost =
-      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+  InstructionCost ShuffleCost =
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;

   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
   // We need additional instructions to keep sources.
   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
-  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
-             NumOfMoves;
+  InstructionCost Cost =
+      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+      NumOfMoves;
   return Cost;
 }

-int X86TTIImpl::getInterleavedMemoryOpCost(
+InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
     bool UseMaskForCond, bool UseMaskForGaps) {
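For the store path at the end of the AVX-512 helper, each of the NumOfMemOps stores pays its memory cost plus Factor - 1 merging shuffles, with extra moves again covering clobbered permute sources. The same arithmetic, standalone and with invented operand values (the real ones come from legalization and the target cost hooks):

#include <cstdio>

int main() {
  // Invented example values -- see the lead-in.
  unsigned Factor = 4, NumOfMemOps = 2;
  unsigned MemOpCost = 1, ShuffleCost = 1;

  unsigned NumOfShufflesPerStore = Factor - 1; // merge Factor sources pairwise
  // SK_PermuteTwoSrc clobbers a source operand, hence the extra moves.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;

  unsigned Cost =
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  std::printf("modeled store cost = %u\n", Cost); // 2 * (1 + 3) + 3 = 11
  return 0;
}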