1 files changed, 153 insertions, 37 deletions
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1820ad959fcb..d75fef7b0171 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -38,7 +38,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
   return (CallerBits & CalleeBits) == CalleeBits;
 }
 
-/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
 int AArch64TTIImpl::getIntImmCost(int64_t Val) {
@@ -54,7 +54,7 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
   return (64 - LZ + 15) / 16;
 }
 
-/// \brief Calculate the cost of materializing the given constant.
+/// Calculate the cost of materializing the given constant.
 int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
   assert(Ty->isIntegerTy());
 
@@ -277,7 +277,7 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
       // same as the second operand. In this case, we will generate a "long"
       // version of the widening instruction.
       if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
-        if (I->getOpcode() == Cast->getOpcode() &&
+        if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
             cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
           return 0;
     }
@@ -493,32 +493,70 @@ int AArch64TTIImpl::getArithmeticInstrCost(
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
 
-  if (ISD == ISD::SDIV &&
-      Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
-      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
-    // On AArch64, scalar signed division by constants power-of-two are
-    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
-    // The OperandValue properties many not be same as that of previous
-    // operation; conservatively assume OP_None.
-    Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
-    Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
-    Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
-    Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
-                                   TargetTransformInfo::OP_None,
-                                   TargetTransformInfo::OP_None);
-    return Cost;
-  }
-
   switch (ISD) {
   default:
     return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                                 Opd1PropInfo, Opd2PropInfo);
+  case ISD::SDIV:
+    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+        Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+      // On AArch64, scalar signed division by constants power-of-two are
+      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+      // The OperandValue properties many not be same as that of previous
+      // operation; conservatively assume OP_None.
+      Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+                                     TargetTransformInfo::OP_None,
+                                     TargetTransformInfo::OP_None);
+      return Cost;
+    }
+    LLVM_FALLTHROUGH;
+  case ISD::UDIV:
+    if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue) {
+      auto VT = TLI->getValueType(DL, Ty);
+      if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
+        // Vector signed division by constant are expanded to the
+        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
+        // to MULHS + SUB + SRL + ADD + SRL.
+        int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
+                                             Opd2Info,
+                                             TargetTransformInfo::OP_None,
+                                             TargetTransformInfo::OP_None);
+        int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
+                                             Opd2Info,
+                                             TargetTransformInfo::OP_None,
+                                             TargetTransformInfo::OP_None);
+        int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
+                                             Opd2Info,
+                                             TargetTransformInfo::OP_None,
+                                             TargetTransformInfo::OP_None);
+        return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
+      }
+    }
+
+    Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+                                          Opd1PropInfo, Opd2PropInfo);
+    if (Ty->isVectorTy()) {
+      // On AArch64, vector divisions are not supported natively and are
+      // expanded into scalar divisions of each pair of elements.
+      Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
+                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
+      Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
+                                     Opd2Info, Opd1PropInfo, Opd2PropInfo);
+      // TODO: if one of the arguments is scalar, then it's not necessary to
+      // double the cost of handling the vector elements.
+      Cost += Cost;
+    }
+    return Cost;
+
   case ISD::ADD:
   case ISD::MUL:
   case ISD::XOR:
@@ -596,14 +634,22 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Ty->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+    unsigned ProfitableNumElements;
+    if (Opcode == Instruction::Store)
+      // We use a custom trunc store lowering so v.4b should be profitable.
+      ProfitableNumElements = 4;
+    else
+      // We scalarize the loads because there is not v.4b register and we
+      // have to promote the elements to v.2.
+      ProfitableNumElements = 8;
+
+    if (Ty->getVectorNumElements() < ProfitableNumElements) {
+      unsigned NumVecElts = Ty->getVectorNumElements();
+      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+      // We generate 2 instructions per vector element.
+      return NumVectorizableInstsToAmortize * NumVecElts * 2;
+    }
   }
 
   return LT.first;
@@ -690,14 +736,14 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   };
 
   int StridedLoads = countStridedLoads(L, SE);
-  DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
-               << " strided loads\n");
+  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+                    << " strided loads\n");
   // Pick the largest power of 2 unroll count that won't result in too many
   // strided loads.
   if (StridedLoads) {
     UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
-    DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
-                 << '\n');
+    LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
+                      << UP.MaxCount << '\n');
   }
 }
 
@@ -868,3 +914,73 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
   }
   return false;
 }
+
+int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
+                                               bool IsPairwiseForm) {
+
+  if (IsPairwiseForm)
+    return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+
+  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+  MVT MTy = LT.second;
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // Horizontal adds can use the 'addv' instruction. We model the cost of these
+  // instructions as normal vector adds. This is the only arithmetic vector
+  // reduction operation for which we have an instruction.
+  static const CostTblEntry CostTblNoPairwise[]{
+      {ISD::ADD, MVT::v8i8,  1},
+      {ISD::ADD, MVT::v16i8, 1},
+      {ISD::ADD, MVT::v4i16, 1},
+      {ISD::ADD, MVT::v8i16, 1},
+      {ISD::ADD, MVT::v4i32, 1},
+  };
+
+  if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
+    return LT.first * Entry->Cost;
+
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+}
+
+int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                   Type *SubTp) {
+  if (Kind == TTI::SK_Transpose || Kind == TTI::SK_Select ||
+      Kind == TTI::SK_PermuteSingleSrc) {
+    static const CostTblEntry ShuffleTbl[] = {
+      // Transpose shuffle kinds can be performed with 'trn1/trn2' and
+      // 'zip1/zip2' instructions.
+      { TTI::SK_Transpose, MVT::v8i8,  1 },
+      { TTI::SK_Transpose, MVT::v16i8, 1 },
+      { TTI::SK_Transpose, MVT::v4i16, 1 },
+      { TTI::SK_Transpose, MVT::v8i16, 1 },
+      { TTI::SK_Transpose, MVT::v2i32, 1 },
+      { TTI::SK_Transpose, MVT::v4i32, 1 },
+      { TTI::SK_Transpose, MVT::v2i64, 1 },
+      { TTI::SK_Transpose, MVT::v2f32, 1 },
+      { TTI::SK_Transpose, MVT::v4f32, 1 },
+      { TTI::SK_Transpose, MVT::v2f64, 1 },
+      // Select shuffle kinds.
+      // TODO: handle vXi8/vXi16.
+      { TTI::SK_Select, MVT::v2i32, 1 }, // mov.
+      { TTI::SK_Select, MVT::v4i32, 2 }, // rev+trn (or similar).
+      { TTI::SK_Select, MVT::v2i64, 1 }, // mov.
+      { TTI::SK_Select, MVT::v2f32, 1 }, // mov.
+      { TTI::SK_Select, MVT::v4f32, 2 }, // rev+trn (or similar).
+      { TTI::SK_Select, MVT::v2f64, 1 }, // mov.
+      // PermuteSingleSrc shuffle kinds.
+      // TODO: handle vXi8/vXi16.
+      { TTI::SK_PermuteSingleSrc, MVT::v2i32, 1 }, // mov.
+      { TTI::SK_PermuteSingleSrc, MVT::v4i32, 3 }, // perfectshuffle worst case.
+      { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // mov.
+      { TTI::SK_PermuteSingleSrc, MVT::v2f32, 1 }, // mov.
+      { TTI::SK_PermuteSingleSrc, MVT::v4f32, 3 }, // perfectshuffle worst case.
+      { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // mov.
+    };
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+    if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+}