Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 365
1 file changed, 280 insertions, 85 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 542a5f006c0f..7b8a79640bb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -15,40 +15,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
+#include "AMDGPUTargetMachine.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <utility>
+#include "llvm/Support/KnownBits.h"
 
 using namespace llvm;
 
@@ -82,7 +54,25 @@ static cl::opt<bool> UseLegacyDA(
 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
     "amdgpu-unroll-max-block-to-analyze",
     cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
-    cl::init(20), cl::Hidden);
+    cl::init(32), cl::Hidden);
+
+static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
+                                       cl::Hidden, cl::init(4000),
+                                       cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers we gain nothing by aggressively inlining functions for that
+// heuristic.
+static cl::opt<unsigned>
+    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
+                    cl::init(256),
+                    cl::desc("Maximum alloca size to use for inline cost"));
+
+// Inliner constraint to achieve reasonable compilation time.
+static cl::opt<size_t> InlineMaxBB(
+    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
+    cl::desc("Maximum number of BBs allowed in a function after inlining"
+             " (compile time constraint)"));
 
 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                               unsigned Depth = 0) {
@@ -103,6 +93,12 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
   return false;
 }
 
+AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      TargetTriple(TM->getTargetTriple()),
+      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+      TLI(ST->getTargetLowering()) {}
+
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                             TTI::UnrollingPreferences &UP) {
   const Function &F = *L->getHeader()->getParent();
@@ -116,6 +112,26 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   const unsigned MaxAlloca = (256 - 16) * 4;
   unsigned ThresholdPrivate = UnrollThresholdPrivate;
   unsigned ThresholdLocal = UnrollThresholdLocal;
+
+  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
+  // provided threshold value as the default for Threshold
+  if (MDNode *LoopUnrollThreshold =
+          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
+    if (LoopUnrollThreshold->getNumOperands() == 2) {
+      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
+          LoopUnrollThreshold->getOperand(1));
+      if (MetaThresholdValue) {
+        // We will also use the supplied value for PartialThreshold for now.
+        // We may introduce additional metadata if it becomes necessary in the
+        // future.
+        UP.Threshold = MetaThresholdValue->getSExtValue();
+        UP.PartialThreshold = UP.Threshold;
+        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
+        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
+      }
+    }
+  }
+
   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
@@ -169,7 +185,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
       const Value *Ptr = GEP->getPointerOperand();
       const AllocaInst *Alloca =
-          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+          dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
       if (!Alloca || !Alloca->isStaticAlloca())
         continue;
       Type *Ty = Alloca->getAllocatedType();
@@ -231,7 +247,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 
     // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
-    if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
+    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
       UP.MaxIterationsCountToAnalyze = 32;
   }
 }
@@ -240,6 +256,41 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                                           TTI::PeelingPreferences &PP) {
   BaseT::getPeelingPreferences(L, SE, PP);
 }
+
+const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
+    // Codegen control options which don't matter.
+    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
+    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
+    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
+    AMDGPU::FeatureUnalignedAccessMode,
+
+    AMDGPU::FeatureAutoWaitcntBeforeBarrier,
+
+    // Property of the kernel/environment which can't actually differ.
+    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
+    AMDGPU::FeatureTrapHandler,
+
+    // The default assumption needs to be ecc is enabled, but no directly
+    // exposed operations depend on it, so it can be safely inlined.
+    AMDGPU::FeatureSRAMECC,
+
+    // Perf-tuning features
+    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
+
+GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
+      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
+      MaxVGPRs(ST->getMaxNumVGPRs(
+          std::max(ST->getWavesPerEU(F).first,
+                   ST->getWavesPerEUForWorkGroup(
+                       ST->getFlatWorkGroupSizes(F).second)))) {
+  AMDGPU::SIModeRegisterDefaults Mode(F);
+  HasFP32Denormals = Mode.allFP32Denormals();
+  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+}
+
 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
   // The concept of vector registers doesn't really exist. Some packed vector
   // operations operate on the normal 32-bit registers.
@@ -267,6 +318,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
   return 32;
 }
 
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+    return 32 * 4 / ElemWidth;
+  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                                          unsigned ChainSizeInBytes,
                                          VectorType *VecTy) const {
@@ -451,9 +508,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: We're having to query the throughput cost so that the basic
     // implementation tries to generate legalize and scalarization costs. Maybe
     // we could hoist the scalarization code here?
-    return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
-                                         Opd1Info, Opd2Info,
-                                         Opd1PropInfo, Opd2PropInfo);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+                                           Opd1Info, Opd2Info, Opd1PropInfo,
+                                           Opd2PropInfo, Args, CxtI);
+    // Scalarization
+
+    // Check if any of the operands are vector operands.
+    int ISD = TLI->InstructionOpcodeToISD(Opcode);
+    assert(ISD && "Invalid opcode");
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+    bool IsFloat = Ty->isFPOrFPVectorTy();
+    // Assume that floating point arithmetic operations cost twice as much as
+    // integer operations.
+    unsigned OpCost = (IsFloat ? 2 : 1);
+
+    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+      // The operation is legal. Assume it costs 1.
+      // TODO: Once we have extract/insert subvector cost we need to use them.
+      return LT.first * OpCost;
+    }
+
+    if (!TLI->isOperationExpand(ISD, LT.second)) {
+      // If the operation is custom lowered, then assume that the code is twice
+      // as expensive.
+      return LT.first * 2 * OpCost;
+    }
+
+    // Else, assume that we need to scalarize this op.
+    // TODO: If one of the types get legalized by splitting, handle this
+    // similarly to what getCastInstrCost() does.
+    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
+      unsigned Cost = getArithmeticInstrCost(
+          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+      // Return the cost of multiple scalar invocation plus the cost of
+      // inserting and extracting the values.
+      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+    }
+
+    // We don't know anything about this scalar instruction.
+    return OpCost;
   }
 
   // Legalize the type.
@@ -472,7 +570,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   case ISD::SRL:
   case ISD::SRA:
     if (SLT == MVT::i64)
-      return get64BitInstrCost() * LT.first * NElts;
+      return get64BitInstrCost(CostKind) * LT.first * NElts;
 
     if (ST->has16BitInsts() && SLT == MVT::i16)
       NElts = (NElts + 1) / 2;
@@ -494,7 +592,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     return LT.first * NElts * getFullRateInstrCost();
   case ISD::MUL: {
-    const int QuarterRateCost = getQuarterRateInstrCost();
+    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
     if (SLT == MVT::i64) {
       const int FullRateCost = getFullRateInstrCost();
       return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -506,11 +604,32 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // i32
     return QuarterRateCost * NElts * LT.first;
   }
+  case ISD::FMUL:
+    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
+    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
+    // fused operation.
+    if (CxtI && CxtI->hasOneUse())
+      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
+        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
+        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
+          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
+            return TargetTransformInfo::TCC_Free;
+          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
+            return TargetTransformInfo::TCC_Free;
+
+          // Estimate all types may be fused with contract/unsafe flags
+          const TargetOptions &Options = TLI->getTargetMachine().Options;
+          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+              Options.UnsafeFPMath ||
+              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
+            return TargetTransformInfo::TCC_Free;
+        }
+      }
+    LLVM_FALLTHROUGH;
   case ISD::FADD:
   case ISD::FSUB:
-  case ISD::FMUL:
     if (SLT == MVT::f64)
-      return LT.first * NElts * get64BitInstrCost();
+      return LT.first * NElts * get64BitInstrCost(CostKind);
 
     if (ST->has16BitInsts() && SLT == MVT::f16)
       NElts = (NElts + 1) / 2;
@@ -523,7 +642,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // FIXME: frem should be handled separately. The fdiv in it is most of it,
     // but the current lowering is also not entirely correct.
     if (SLT == MVT::f64) {
-      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+      int Cost = 7 * get64BitInstrCost(CostKind) +
+                 getQuarterRateInstrCost(CostKind) +
+                 3 * getHalfRateInstrCost(CostKind);
       // Add cost of workaround.
       if (!ST->hasUsableDivScaleConditionOutput())
         Cost += 3 * getFullRateInstrCost();
@@ -535,7 +656,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       // TODO: This is more complicated, unsafe flags etc.
       if ((SLT == MVT::f32 && !HasFP32Denormals) ||
           (SLT == MVT::f16 && ST->has16BitInsts())) {
-        return LT.first * getQuarterRateInstrCost() * NElts;
+        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
       }
     }
@@ -545,12 +666,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
       // f32 fmul
       // v_cvt_f16_f32
       // f16 div_fixup
-      int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+      int Cost =
+          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
       return LT.first * Cost * NElts;
     }
 
     if (SLT == MVT::f32 || SLT == MVT::f16) {
-      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+      // 4 more v_cvt_* insts without f16 insts support
+      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
+                 1 * getQuarterRateInstrCost(CostKind);
 
       if (!HasFP32Denormals) {
         // FP mode switches.
@@ -568,18 +692,21 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     break;
   }
 
-  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
-                                       Opd2Info,
-                                       Opd1PropInfo, Opd2PropInfo);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
 }
 
-// Return true if there's a potential benefit from using v2f16 instructions for
-// an intrinsic, even if it requires nontrivial legalization.
+// Return true if there's a potential benefit from using v2f16/v2i16
+// instructions for an intrinsic, even if it requires nontrivial legalization.
 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
   switch (ID) {
   case Intrinsic::fma: // TODO: fmuladd
   // There's a small benefit to using vector ops in the legalized code.
   case Intrinsic::round:
+  case Intrinsic::uadd_sat:
+  case Intrinsic::usub_sat:
+  case Intrinsic::sadd_sat:
+  case Intrinsic::ssub_sat:
     return true;
   default:
     return false;
@@ -597,7 +724,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   Type *RetTy = ICA.getReturnType();
   EVT OrigTy = TLI->getValueType(DL, RetTy);
   if (!OrigTy.isSimple()) {
-    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+    if (CostKind != TTI::TCK_CodeSize)
+      return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+    // TODO: Combine these two logic paths.
+    if (ICA.isTypeBasedOnly())
+      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+    Type *RetTy = ICA.getReturnType();
+    unsigned VF = ICA.getVectorFactor().getFixedValue();
+    unsigned RetVF =
+        (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
+                             : 1);
+    assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+    const IntrinsicInst *I = ICA.getInst();
+    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+    FastMathFlags FMF = ICA.getFlags();
+    // Assume that we need to scalarize this intrinsic.
+    SmallVector<Type *, 4> Types;
+    for (const Value *Op : Args) {
+      Type *OpTy = Op->getType();
+      assert(VF == 1 || !OpTy->isVectorTy());
+      Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+    }
+
+    if (VF > 1 && !RetTy->isVoidTy())
+      RetTy = FixedVectorType::get(RetTy, VF);
+
+    // Compute the scalarization overhead based on Args for a vector
+    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+    // CostModel will pass a vector RetTy and VF is 1.
+    unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
+    if (RetVF > 1 || VF > 1) {
+      ScalarizationCost = 0;
+      if (!RetTy->isVoidTy())
+        ScalarizationCost +=
+            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+      ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+    }
+
+    IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+                                  ScalarizationCost, I);
+    return getIntrinsicInstrCost(Attrs, CostKind);
   }
 
   // Legalize the type.
@@ -609,16 +777,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
   if (SLT == MVT::f64)
-    return LT.first * NElts * get64BitInstrCost();
+    return LT.first * NElts * get64BitInstrCost(CostKind);
 
   if (ST->has16BitInsts() && SLT == MVT::f16)
     NElts = (NElts + 1) / 2;
 
   // TODO: Get more refined intrinsic costs?
-  unsigned InstRate = getQuarterRateInstrCost();
+  unsigned InstRate = getQuarterRateInstrCost(CostKind);
   if (ICA.getID() == Intrinsic::fma) {
-    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
-                                   : getQuarterRateInstrCost();
+    InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+                                   : getQuarterRateInstrCost(CostKind);
   }
 
   return LT.first * NElts * InstRate;
@@ -669,7 +837,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                        CostKind);
 
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
-  return LT.first * getHalfRateInstrCost();
+  return LT.first * getHalfRateInstrCost(CostKind);
 }
 
 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -697,32 +865,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   }
 }
 
-static bool isArgPassedInSGPR(const Argument *A) {
-  const Function *F = A->getParent();
-
-  // Arguments to compute shaders are never a source of divergence.
-  CallingConv::ID CC = F->getCallingConv();
-  switch (CC) {
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return true;
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_LS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_ES:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
-    // Everything else is in VGPRs.
-    return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
-           F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
-  default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
-  }
-}
-
 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
 /// this is analyzing the collective result of all output registers. Otherwise,
 /// this is only querying a specific result index if this returns multiple
@@ -779,7 +921,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
 /// different across workitems in a wavefront.
 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (const Argument *A = dyn_cast<Argument>(V))
-    return !isArgPassedInSGPR(A);
+    return !AMDGPU::isArgPassedInSGPR(A);
 
   // Loads from the private and flat address spaces are divergent, because
   // threads can execute the load instruction with the same inputs and get
   // different results.
@@ -921,7 +1063,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
     Type *MaskTy = MaskOp->getType();
 
     bool DoTruncate = false;
-    if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+
+    const GCNTargetMachine &TM =
+        static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
+    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
       // All valid 64-bit to 32-bit casts work by chopping off the high
       // bits. Any masking only clearing the low bits will also apply in the new
       // address space.
@@ -993,7 +1138,47 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
   // no way to support merge for backend defined attributes.
   AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
   AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
-  return CallerMode.isInlineCompatible(CalleeMode);
+  if (!CallerMode.isInlineCompatible(CalleeMode))
+    return false;
+
+  // Hack to make compile times reasonable.
+  if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+    // Single BB does not increase total BB amount, thus subtract 1.
+    size_t BBSize = Caller->size() + Callee->size() - 1;
+    return BBSize <= InlineMaxBB;
+  }
+
+  return true;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  // If we have a pointer to private array passed into a function
+  // it will not be optimized out, leaving scratch usage.
+  // Increase the inline threshold to allow inlining in this case.
+  uint64_t AllocaSize = 0;
+  SmallPtrSet<const AllocaInst *, 8> AIVisited;
+  for (Value *PtrArg : CB->args()) {
+    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+    if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+                Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+      continue;
+
+    PtrArg = getUnderlyingObject(PtrArg);
+    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+        continue;
+      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+      // If the amount of stack memory is excessive we will not be able
+      // to get rid of the scratch anyway, bail out.
+      if (AllocaSize > ArgAllocaCutoff) {
+        AllocaSize = 0;
+        break;
+      }
+    }
+  }
+  if (AllocaSize)
+    return ArgAllocaCost;
+  return 0;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
@@ -1006,6 +1191,16 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
   CommonTTI.getPeelingPreferences(L, SE, PP);
 }
 
+int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
+  return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+                                : getQuarterRateInstrCost(CostKind);
+}
+
+R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+    : BaseT(TM, F.getParent()->getDataLayout()),
+      ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
+      TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
+
 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
   return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
 }
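Notes on selected changes follow. Each note is a minimal standalone C++ sketch with invented names; none of this code is part of the patch or of LLVM.

The new amdgpu.loop.unroll.threshold handling in AMDGPUTTIImpl::getUnrollingPreferences lets a front end pin a per-loop unroll threshold through loop metadata: the value becomes both Threshold and PartialThreshold and also caps the private/local boost thresholds. A sketch of that clamping, assuming the metadata value has already been read into an optional integer (Prefs and applyLoopUnrollThresholdOverride are made up):

#include <algorithm>
#include <cstdint>
#include <optional>

struct Prefs {
  unsigned Threshold = 300;
  unsigned PartialThreshold = 150;
  unsigned ThresholdPrivate = 2700; // boost for private-memory loops
  unsigned ThresholdLocal = 1000;   // boost for LDS loops
};

void applyLoopUnrollThresholdOverride(Prefs &P,
                                      std::optional<int64_t> MetaValue) {
  if (!MetaValue)
    return; // no metadata on the loop: keep the defaults
  // The metadata value becomes the default Threshold and PartialThreshold,
  // and also caps the private/local boost thresholds.
  P.Threshold = static_cast<unsigned>(*MetaValue);
  P.PartialThreshold = P.Threshold;
  P.ThresholdPrivate = std::min(P.ThresholdPrivate, P.Threshold);
  P.ThresholdLocal = std::min(P.ThresholdLocal, P.Threshold);
}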
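GCNTTIImpl::getMaximumVF caps the vectorization factor: memory operations are limited by the widest 128-bit (4-dword) access, and ALU operations only benefit from packing for 16-bit elements on subtargets with 16-bit instructions. A standalone mirror of the arithmetic, with the function name and bool parameters standing in for the TTI/subtarget queries:

#include <cassert>

unsigned maximumVF(unsigned ElemWidth, bool IsMemOp, bool Has16BitInsts) {
  if (IsMemOp)
    return 32 * 4 / ElemWidth; // widest load/store is 4 dwords (128 bits)
  return (ElemWidth == 16 && Has16BitInsts) ? 2 : 1; // packed 16-bit ALU ops
}

int main() {
  assert(maximumVF(8, true, true) == 16);  // b8  -> 16 lanes per 128-bit access
  assert(maximumVF(32, true, true) == 4);  // b32 -> dwordx4
  assert(maximumVF(16, false, true) == 2); // v2f16 / v2i16 ALU
  assert(maximumVF(32, false, true) == 1); // scalar 32-bit ALU
}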
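For non-simple types under TCK_CodeSize, getArithmeticInstrCost now estimates the cost itself instead of deferring to the base implementation: roughly one unit per legalized part for legal operations, twice that for custom lowering, and per-element scalar cost plus insert/extract overhead when the operation must be scalarized. A rough sketch of that shape (the enum and function are invented; the real code queries TargetLoweringInfo):

enum class Legality { Legal, Custom, Expand };

unsigned arithmeticCodeSizeCost(Legality Kind, unsigned LegalizationParts,
                                bool IsFloat, unsigned NumElts,
                                unsigned ScalarizationOverhead) {
  unsigned OpCost = IsFloat ? 2 : 1; // FP ops assumed twice as expensive
  switch (Kind) {
  case Legality::Legal:
    return LegalizationParts * OpCost;     // legal: one op per legalized part
  case Legality::Custom:
    return LegalizationParts * 2 * OpCost; // custom lowering: assume ~2x
  case Legality::Expand:
    // Scalarize: one scalar op per element plus insert/extract overhead.
    return ScalarizationOverhead + NumElts * OpCost;
  }
  return OpCost;
}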
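The new ISD::FMUL case treats an fmul as free when its only user is an fadd or fsub that the backend will fold into a MAD/FMA, either because denormals are disabled for the type or because contraction is allowed globally or on both instructions. The predicate below restates that decision over plain booleans standing in for the subtarget, mode-register, and fast-math queries (all names invented):

struct FuseQuery {
  bool SingleUserIsFAddOrFSub;
  bool IsF32, IsF16;
  bool HasMadMacF32Insts, Has16BitInsts;
  bool FP32DenormalsEnabled, FP16DenormalsEnabled;
  bool FPOpFusionFast, UnsafeFPMath;
  bool BothHaveAllowContract;
};

bool fmulIsFreeViaFusion(const FuseQuery &Q) {
  if (!Q.SingleUserIsFAddOrFSub)
    return false;
  if (Q.HasMadMacF32Insts && Q.IsF32 && !Q.FP32DenormalsEnabled)
    return true; // folds into v_mad_f32 / v_mac_f32
  if (Q.Has16BitInsts && Q.IsF16 && !Q.FP16DenormalsEnabled)
    return true; // folds into a 16-bit mad/fma
  // Otherwise only when contraction is allowed globally or on both
  // instructions.
  return Q.FPOpFusionFast || Q.UnsafeFPMath || Q.BothHaveAllowContract;
}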
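adjustInliningThreshold adds a flat bonus (the -amdgpu-inline-arg-alloca-cost default of 4000) when a call passes private or flat address-space pointers that trace back to static allocas, since inlining is the only way to promote that scratch memory to registers; once the accumulated size passes the -amdgpu-inline-arg-alloca-cutoff default of 256 bytes, the bonus is dropped because the scratch could not be eliminated anyway. A simplified standalone version (ArgInfo and inlineThresholdBonus are invented, and the alloca deduplication done by the real code is omitted):

#include <cstdint>
#include <vector>

struct ArgInfo {
  bool IsPrivateOrFlatPtr;   // pointer into private or flat address space
  bool PointsToStaticAlloca; // underlying object is a static alloca
  uint64_t AllocaBytes;      // size of that alloca
};

uint64_t inlineThresholdBonus(const std::vector<ArgInfo> &Args,
                              uint64_t ArgAllocaCost = 4000,
                              uint64_t ArgAllocaCutoff = 256) {
  uint64_t AllocaSize = 0;
  for (const ArgInfo &A : Args) {
    if (!A.IsPrivateOrFlatPtr || !A.PointsToStaticAlloca)
      continue;
    AllocaSize += A.AllocaBytes;
    if (AllocaSize > ArgAllocaCutoff)
      return 0; // too much scratch to promote anyway: no bonus
  }
  return AllocaSize ? ArgAllocaCost : 0;
}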
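The areInlineCompatible change adds a compile-time guard: unless the callee is marked inlinehint, inlining is rejected when the combined basic-block count of caller and callee would exceed the -amdgpu-inline-max-bb default of 1100. Expressed as a small predicate (names are illustrative):

#include <cstddef>

bool withinInlineBBLimit(std::size_t CallerBBs, std::size_t CalleeBBs,
                         bool CalleeHasInlineHint,
                         std::size_t InlineMaxBB = 1100) {
  if (InlineMaxBB == 0 || CalleeHasInlineHint)
    return true; // limit disabled, or callee explicitly asks to be inlined
  // A single-BB callee does not add a block to the caller, hence the -1.
  return CallerBBs + CalleeBBs - 1 <= InlineMaxBB;
}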