Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp  365
1 file changed, 280 insertions(+), 85 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 542a5f006c0f..7b8a79640bb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -15,40 +15,12 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachineValueType.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include <algorithm>
-#include <cassert>
-#include <limits>
-#include <utility>
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -82,7 +54,25 @@ static cl::opt<bool> UseLegacyDA(
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
"amdgpu-unroll-max-block-to-analyze",
cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
- cl::init(20), cl::Hidden);
+ cl::init(32), cl::Hidden);
+
+static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
+ cl::Hidden, cl::init(4000),
+ cl::desc("Cost of alloca argument"));
+
+// If the amount of scratch memory to eliminate exceeds our ability to allocate
+// it into registers we gain nothing by aggressively inlining functions for that
+// heuristic.
+static cl::opt<unsigned>
+ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
+ cl::init(256),
+ cl::desc("Maximum alloca size to use for inline cost"));
+
+// Inliner constraint to achieve reasonable compilation time.
+static cl::opt<size_t> InlineMaxBB(
+ "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
+ cl::desc("Maximum number of BBs allowed in a function after inlining"
+ " (compile time constraint)"));
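+
+(Usage note, not part of the diff: these new options are hidden llvm::cl flags, so as an illustrative invocation they can be tuned directly on opt/llc, e.g. opt -amdgpu-inline-max-bb=900, or from the clang driver via -mllvm -amdgpu-inline-max-bb=900.)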
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
@@ -103,6 +93,12 @@ static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
return false;
}
+AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()) {}
+
void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
const Function &F = *L->getHeader()->getParent();
@@ -116,6 +112,26 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
const unsigned MaxAlloca = (256 - 16) * 4;
unsigned ThresholdPrivate = UnrollThresholdPrivate;
unsigned ThresholdLocal = UnrollThresholdLocal;
+
+ // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
+ // provided threshold value as the default for Threshold
+ if (MDNode *LoopUnrollThreshold =
+ findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
+ if (LoopUnrollThreshold->getNumOperands() == 2) {
+ ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
+ LoopUnrollThreshold->getOperand(1));
+ if (MetaThresholdValue) {
+ // We will also use the supplied value for PartialThreshold for now.
+ // We may introduce additional metadata if it becomes necessary in the
+ // future.
+ UP.Threshold = MetaThresholdValue->getSExtValue();
+ UP.PartialThreshold = UP.Threshold;
+ ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
+ ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
+ }
+ }
+ }
+
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
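
As a reference point (not part of this patch), here is a minimal C++ sketch of how the amdgpu.loop.unroll.threshold option consumed above could be attached to a loop's llvm.loop metadata; the helper name attachAMDGPUUnrollThreshold is hypothetical.

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"

// Attach !{!"amdgpu.loop.unroll.threshold", i32 Threshold} to the loop ID
// metadata on the latch terminator, so findOptionMDForLoop() can find it.
static void attachAMDGPUUnrollThreshold(llvm::Instruction *LatchTerminator,
                                        unsigned Threshold) {
  using namespace llvm;
  LLVMContext &Ctx = LatchTerminator->getContext();
  Metadata *OptionOps[] = {
      MDString::get(Ctx, "amdgpu.loop.unroll.threshold"),
      ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), Threshold))};
  MDNode *Option = MDNode::get(Ctx, OptionOps);
  // The loop ID node references itself in its first operand.
  Metadata *LoopIDOps[] = {nullptr, Option};
  MDNode *LoopID = MDNode::getDistinct(Ctx, LoopIDOps);
  LoopID->replaceOperandWith(0, LoopID);
  LatchTerminator->setMetadata(LLVMContext::MD_loop, LoopID);
}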
@@ -169,7 +185,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
const Value *Ptr = GEP->getPointerOperand();
const AllocaInst *Alloca =
- dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+ dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
if (!Alloca || !Alloca->isStaticAlloca())
continue;
Type *Ty = Alloca->getAllocatedType();
@@ -231,7 +247,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// If we got a GEP in a small BB from inner loop then increase max trip
// count to analyze for better estimation cost in unroll
- if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
+ if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
UP.MaxIterationsCountToAnalyze = 32;
}
}
@@ -240,6 +256,41 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
}
+
+const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
+ // Codegen control options which don't matter.
+ AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
+ AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
+ AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
+ AMDGPU::FeatureUnalignedAccessMode,
+
+ AMDGPU::FeatureAutoWaitcntBeforeBarrier,
+
+ // Property of the kernel/environment which can't actually differ.
+ AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
+ AMDGPU::FeatureTrapHandler,
+
+ // The default assumption needs to be ecc is enabled, but no directly
+ // exposed operations depend on it, so it can be safely inlined.
+ AMDGPU::FeatureSRAMECC,
+
+ // Perf-tuning features
+ AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
+
+GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()), CommonTTI(TM, F),
+ IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
+ MaxVGPRs(ST->getMaxNumVGPRs(
+ std::max(ST->getWavesPerEU(F).first,
+ ST->getWavesPerEUForWorkGroup(
+ ST->getFlatWorkGroupSizes(F).second)))) {
+ AMDGPU::SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.allFP32Denormals();
+ HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+}
+
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
@@ -267,6 +318,12 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
return 32;
}
+unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store)
+ return 32 * 4 / ElemWidth;
+ return (ElemWidth == 16 && ST->has16BitInsts()) ? 2 : 1;
+}
+
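+
+(A short worked reading of the getMaximumVF() hook added above, illustrative numbers only, following the 32 * 4 = 128-bit limit: for loads and stores it returns 16 for i8 elements, 8 for i16, 4 for i32, and 2 for i64; for all other opcodes it returns 2 only for 16-bit elements on subtargets with 16-bit instructions, and 1 otherwise.)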
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
unsigned ChainSizeInBytes,
VectorType *VecTy) const {
@@ -451,9 +508,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: We're having to query the throughput cost so that the basic
// implementation tries to generate legalize and scalarization costs. Maybe
// we could hoist the scalarization code here?
- return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
- Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ if (CostKind != TTI::TCK_CodeSize)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+ // Scalarization
+
+ // Check if any of the operands are vector operands.
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+
+ bool IsFloat = Ty->isFPOrFPVectorTy();
+ // Assume that floating point arithmetic operations cost twice as much as
+ // integer operations.
+ unsigned OpCost = (IsFloat ? 2 : 1);
+
+ if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
+ // The operation is legal. Assume it costs 1.
+ // TODO: Once we have extract/insert subvector cost we need to use them.
+ return LT.first * OpCost;
+ }
+
+ if (!TLI->isOperationExpand(ISD, LT.second)) {
+ // If the operation is custom lowered, then assume that the code is twice
+ // as expensive.
+ return LT.first * 2 * OpCost;
+ }
+
+ // Else, assume that we need to scalarize this op.
+ // TODO: If one of the types get legalized by splitting, handle this
+ // similarly to what getCastInstrCost() does.
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
+ unsigned Cost = getArithmeticInstrCost(
+ Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+ // Return the cost of multiple scalar invocation plus the cost of
+ // inserting and extracting the values.
+ return getScalarizationOverhead(VTy, Args) + Num * Cost;
+ }
+
+ // We don't know anything about this scalar instruction.
+ return OpCost;
}
// Legalize the type.
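
For context (not part of the patch), a hedged caller-side sketch of how the new TCK_CodeSize path above is reached through the public TargetTransformInfo interface; the wrapper name and the <4 x i64> type are illustrative assumptions.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"

// Ask the cost model for the code-size cost of a vector add; for types the
// target ends up scalarizing, the hook above returns the scalarization
// overhead plus NumElements times the scalar cost.
static int codeSizeCostOfVectorAdd(const llvm::TargetTransformInfo &TTI,
                                   llvm::LLVMContext &Ctx) {
  auto *VecTy = llvm::FixedVectorType::get(llvm::Type::getInt64Ty(Ctx), 4);
  return TTI.getArithmeticInstrCost(llvm::Instruction::Add, VecTy,
                                    llvm::TargetTransformInfo::TCK_CodeSize);
}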
@@ -472,7 +570,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::SRL:
case ISD::SRA:
if (SLT == MVT::i64)
- return get64BitInstrCost() * LT.first * NElts;
+ return get64BitInstrCost(CostKind) * LT.first * NElts;
if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
@@ -494,7 +592,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * NElts * getFullRateInstrCost();
case ISD::MUL: {
- const int QuarterRateCost = getQuarterRateInstrCost();
+ const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
if (SLT == MVT::i64) {
const int FullRateCost = getFullRateInstrCost();
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -506,11 +604,32 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// i32
return QuarterRateCost * NElts * LT.first;
}
+ case ISD::FMUL:
+ // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
+ // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
+ // fused operation.
+ if (CxtI && CxtI->hasOneUse())
+ if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
+ const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
+ if (OPC == ISD::FADD || OPC == ISD::FSUB) {
+ if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
+ return TargetTransformInfo::TCC_Free;
+ if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
+ return TargetTransformInfo::TCC_Free;
+
+ // Estimate all types may be fused with contract/unsafe flags
+ const TargetOptions &Options = TLI->getTargetMachine().Options;
+ if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+ Options.UnsafeFPMath ||
+ (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
+ return TargetTransformInfo::TCC_Free;
+ }
+ }
+ LLVM_FALLTHROUGH;
case ISD::FADD:
case ISD::FSUB:
- case ISD::FMUL:
if (SLT == MVT::f64)
- return LT.first * NElts * get64BitInstrCost();
+ return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
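
As an aside (not from the patch itself), the shape recognized by the ISD::FMUL case above can be summarized by a small hypothetical predicate; only the contract-flag case is shown, while the mad/f16 and unsafe-math cases follow the same structure.

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

// True when a single-use fmul feeds an fadd/fsub and both carry the
// 'contract' fast-math flag, i.e. the pair is eligible to fuse into fma,
// which is why the fmul above is costed as TCC_Free.
static bool looksFusableToFMA(const llvm::Instruction *FMul) {
  if (FMul->getOpcode() != llvm::Instruction::FMul || !FMul->hasOneUse())
    return false;
  const auto *AddSub =
      llvm::dyn_cast<llvm::BinaryOperator>(*FMul->user_begin());
  if (!AddSub)
    return false;
  if (AddSub->getOpcode() != llvm::Instruction::FAdd &&
      AddSub->getOpcode() != llvm::Instruction::FSub)
    return false;
  return FMul->hasAllowContract() && AddSub->hasAllowContract();
}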
@@ -523,7 +642,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: frem should be handled separately. The fdiv in it is most of it,
// but the current lowering is also not entirely correct.
if (SLT == MVT::f64) {
- int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
+ int Cost = 7 * get64BitInstrCost(CostKind) +
+ getQuarterRateInstrCost(CostKind) +
+ 3 * getHalfRateInstrCost(CostKind);
// Add cost of workaround.
if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
@@ -535,7 +656,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// TODO: This is more complicated, unsafe flags etc.
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
- return LT.first * getQuarterRateInstrCost() * NElts;
+ return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
}
}
@@ -545,12 +666,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// f32 fmul
// v_cvt_f16_f32
// f16 div_fixup
- int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
+ int Cost =
+ 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
return LT.first * Cost * NElts;
}
if (SLT == MVT::f32 || SLT == MVT::f16) {
- int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
+ // 4 more v_cvt_* insts without f16 insts support
+ int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
+ 1 * getQuarterRateInstrCost(CostKind);
if (!HasFP32Denormals) {
// FP mode switches.
@@ -568,18 +692,21 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
break;
}
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
- Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
-// Return true if there's a potential benefit from using v2f16 instructions for
-// an intrinsic, even if it requires nontrivial legalization.
+// Return true if there's a potential benefit from using v2f16/v2i16
+// instructions for an intrinsic, even if it requires nontrivial legalization.
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::fma: // TODO: fmuladd
// There's a small benefit to using vector ops in the legalized code.
case Intrinsic::round:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
return true;
default:
return false;
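
For illustration (not part of the patch), a minimal IRBuilder sketch that produces one of the newly listed saturating intrinsics; with <2 x i16> operands this is the packed form the predicate above is about. The helper name is hypothetical.

#include "llvm/IR/IRBuilder.h"

// Emit @llvm.uadd.sat on its operands; on a subtarget with 16-bit
// instructions a <2 x i16> call like this can lower to a single packed
// v_pk_add_u16 with clamping.
static llvm::Value *emitPackedUAddSat(llvm::IRBuilder<> &B, llvm::Value *LHS,
                                      llvm::Value *RHS) {
  return B.CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, LHS, RHS);
}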
@@ -597,7 +724,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
- return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+ if (CostKind != TTI::TCK_CodeSize)
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ // TODO: Combine these two logic paths.
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
+ Type *RetTy = ICA.getReturnType();
+ unsigned VF = ICA.getVectorFactor().getFixedValue();
+ unsigned RetVF =
+ (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
+ : 1);
+ assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+ const IntrinsicInst *I = ICA.getInst();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
+ FastMathFlags FMF = ICA.getFlags();
+ // Assume that we need to scalarize this intrinsic.
+ SmallVector<Type *, 4> Types;
+ for (const Value *Op : Args) {
+ Type *OpTy = Op->getType();
+ assert(VF == 1 || !OpTy->isVectorTy());
+ Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
+ }
+
+ if (VF > 1 && !RetTy->isVoidTy())
+ RetTy = FixedVectorType::get(RetTy, VF);
+
+ // Compute the scalarization overhead based on Args for a vector
+ // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+ // CostModel will pass a vector RetTy and VF is 1.
+ unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
+ if (RetVF > 1 || VF > 1) {
+ ScalarizationCost = 0;
+ if (!RetTy->isVoidTy())
+ ScalarizationCost +=
+ getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+ ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+ }
+
+ IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
+ ScalarizationCost, I);
+ return getIntrinsicInstrCost(Attrs, CostKind);
}
// Legalize the type.
@@ -609,16 +777,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
if (SLT == MVT::f64)
- return LT.first * NElts * get64BitInstrCost();
+ return LT.first * NElts * get64BitInstrCost(CostKind);
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
// TODO: Get more refined intrinsic costs?
- unsigned InstRate = getQuarterRateInstrCost();
+ unsigned InstRate = getQuarterRateInstrCost(CostKind);
if (ICA.getID() == Intrinsic::fma) {
- InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
- : getQuarterRateInstrCost();
+ InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
}
return LT.first * NElts * InstRate;
@@ -669,7 +837,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
- return LT.first * getHalfRateInstrCost();
+ return LT.first * getHalfRateInstrCost(CostKind);
}
int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -697,32 +865,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-static bool isArgPassedInSGPR(const Argument *A) {
- const Function *F = A->getParent();
-
- // Arguments to compute shaders are never a source of divergence.
- CallingConv::ID CC = F->getCallingConv();
- switch (CC) {
- case CallingConv::AMDGPU_KERNEL:
- case CallingConv::SPIR_KERNEL:
- return true;
- case CallingConv::AMDGPU_VS:
- case CallingConv::AMDGPU_LS:
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_ES:
- case CallingConv::AMDGPU_GS:
- case CallingConv::AMDGPU_PS:
- case CallingConv::AMDGPU_CS:
- // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- // Everything else is in VGPRs.
- return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
- F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
- default:
- // TODO: Should calls support inreg for SGPR inputs?
- return false;
- }
-}
-
/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
/// this is analyzing the collective result of all output registers. Otherwise,
/// this is only querying a specific result index if this returns multiple
@@ -779,7 +921,7 @@ bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
if (const Argument *A = dyn_cast<Argument>(V))
- return !isArgPassedInSGPR(A);
+ return !AMDGPU::isArgPassedInSGPR(A);
// Loads from the private and flat address spaces are divergent, because
// threads can execute the load instruction with the same inputs and get
@@ -921,7 +1063,10 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Type *MaskTy = MaskOp->getType();
bool DoTruncate = false;
- if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+
+ const GCNTargetMachine &TM =
+ static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
+ if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
// All valid 64-bit to 32-bit casts work by chopping off the high
// bits. Any masking only clearing the low bits will also apply in the new
// address space.
@@ -993,7 +1138,47 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// no way to support merge for backend defined attributes.
AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
- return CallerMode.isInlineCompatible(CalleeMode);
+ if (!CallerMode.isInlineCompatible(CalleeMode))
+ return false;
+
+ // Hack to make compile times reasonable.
+ if (InlineMaxBB && !Callee->hasFnAttribute(Attribute::InlineHint)) {
+ // Single BB does not increase total BB amount, thus subtract 1.
+ size_t BBSize = Caller->size() + Callee->size() - 1;
+ return BBSize <= InlineMaxBB;
+ }
+
+ return true;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+ // If we have a pointer to private array passed into a function
+ // it will not be optimized out, leaving scratch usage.
+ // Increase the inline threshold to allow inlining in this case.
+ uint64_t AllocaSize = 0;
+ SmallPtrSet<const AllocaInst *, 8> AIVisited;
+ for (Value *PtrArg : CB->args()) {
+ PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
+ if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+ continue;
+
+ PtrArg = getUnderlyingObject(PtrArg);
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
+ if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+ continue;
+ AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
+ // If the amount of stack memory is excessive we will not be able
+ // to get rid of the scratch anyway, bail out.
+ if (AllocaSize > ArgAllocaCutoff) {
+ AllocaSize = 0;
+ break;
+ }
+ }
+ }
+ if (AllocaSize)
+ return ArgAllocaCost;
+ return 0;
}
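
An illustrative device-code pattern (not from this patch) that the new threshold bonus targets: the address of a function-local array escapes into a callee, so unless the callee is inlined the alloca stays in scratch memory. The callee name consume is hypothetical.

// Hypothetical device-side example; 'consume' stands in for any callee that
// is not otherwise an attractive inlining candidate.
int consume(int *Data, int N);

int caller() {
  int Scratch[16];              // becomes a private (scratch) alloca, 64 bytes
  for (int I = 0; I != 16; ++I)
    Scratch[I] = I;
  // Passing a pointer to the private array: adjustInliningThreshold() above
  // adds ArgAllocaCost to the inline threshold while the summed alloca size
  // stays at or below ArgAllocaCutoff (256 by default), encouraging the
  // inliner to eliminate the escape.
  return consume(Scratch, 16);
}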
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
@@ -1006,6 +1191,16 @@ void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getPeelingPreferences(L, SE, PP);
}
+int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
+ return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
+ : getQuarterRateInstrCost(CostKind);
+}
+
+R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
+
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}