diff options
Diffstat (limited to 'lib/Target/PowerPC/PPCTargetTransformInfo.cpp')
-rw-r--r-- | lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 141 |
1 files changed, 100 insertions, 41 deletions
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index e21c2b77f4d7..cd86dabd5abe 100644 --- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -35,7 +35,7 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Imm, Ty); @@ -64,8 +64,8 @@ unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(IID, Idx, Imm, Ty); @@ -98,8 +98,8 @@ unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); @@ -197,9 +197,20 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { + // On the A2, always unroll aggressively. For QPX unaligned loads, we depend + // on combining the loads generated for consecutive accesses, and failure to + // do so is particularly expensive. This makes it much more likely (compared + // to only using concatenation unrolling). + if (ST->getDarwinDirective() == PPC::DIR_A2) + return true; + return LoopHasReductions; } +bool PPCTTIImpl::enableInterleavedAccessVectorization() { + return true; +} + unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasAltivec() && !ST->hasQPX()) return 0; @@ -246,7 +257,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned PPCTTIImpl::getArithmeticInstrCost( +int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { @@ -257,24 +268,30 @@ unsigned PPCTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). We need one such shuffle instruction for each actual + // register (this is not true for arbitrary shuffles, but is true for the + // structured types of shuffles covered by TTI::ShuffleKind). + return LT.first; } -unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -313,41 +330,83 @@ unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); - unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - - // VSX loads/stores support unaligned access. - if (ST->hasVSX()) { - if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) - return Cost; - } + int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - bool UnalignedAltivec = - Src->isVectorTy() && - Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && - LT.second.getSizeInBits() == 128 && - Opcode == Instruction::Load; + // Aligned loads and stores are easy. + unsigned SrcBytes = LT.second.getStoreSize(); + if (!SrcBytes || !Alignment || Alignment >= SrcBytes) + return Cost; + + bool IsAltivecType = ST->hasAltivec() && + (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || + LT.second == MVT::v4i32 || LT.second == MVT::v4f32); + bool IsVSXType = ST->hasVSX() && + (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); + bool IsQPXType = ST->hasQPX() && + (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); + + // If we can use the permutation-based load sequence, then this is also + // relatively cheap (not counting loop-invariant instructions): one load plus + // one permute (the last load in a series has extra cost, but we're + // neglecting that here). Note that on the P7, we should do unaligned loads + // for Altivec types using the VSX instructions, but that's more expensive + // than using the permutation-based load sequence. On the P8, that's no + // longer true. + if (Opcode == Instruction::Load && + ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + Alignment >= LT.second.getScalarType().getStoreSize()) + return Cost + LT.first; // Add the cost of the permutations. + + // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the + // P7, unaligned vector loads are more expensive than the permutation-based + // load sequence, so that might be used instead, but regardless, the net cost + // is about the same (not counting loop-invariant instructions). + if (IsVSXType || (ST->hasVSX() && IsAltivecType)) + return Cost; // PPC in general does not support unaligned loads and stores. They'll need // to be decomposed based on the alignment factor. - unsigned SrcBytes = LT.second.getStoreSize(); - if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { - Cost += LT.first*(SrcBytes/Alignment-1); - - // For a vector type, there is also scalarization overhead (only for - // stores, loads are expanded using the vector-load + permutation sequence, - // which is much less expensive). - if (Src->isVectorTy() && Opcode == Instruction::Store) - for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); - } + + // Add the cost of each scalar load or store. + Cost += LT.first*(SrcBytes/Alignment-1); + + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). + if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + + return Cost; +} + +int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(isa<VectorType>(VecTy) && + "Expect a vector type for interleaved memory op"); + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy); + + // Firstly, the cost of load/store operation. + int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). For each result vector, we need one shuffle per incoming + // vector (except that the first shuffle can take two incoming vectors + // because it does not need to take itself). + Cost += Factor*(LT.first-1); return Cost; } |