Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp'):
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 370
 1 file changed, 296 insertions(+), 74 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 519ae5cc748d..3e53f52c689f 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -14,19 +14,23 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
-#include "AMDGPUTargetObjectFile.h"
#include "AMDGPU.h"
+#include "AMDGPUCallLowering.h"
+#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
+
#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Verifier.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/IR/LegacyPassManager.h"
@@ -34,10 +38,35 @@
#include "llvm/Support/raw_os_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"
-#include <llvm/CodeGen/Passes.h>
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Vectorize.h"
using namespace llvm;
+static cl::opt<bool> EnableR600StructurizeCFG(
+ "r600-ir-structurize",
+ cl::desc("Use StructurizeCFG IR pass"),
+ cl::init(true));
+
+static cl::opt<bool> EnableSROA(
+ "amdgpu-sroa",
+ cl::desc("Run SROA after promote alloca pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+static cl::opt<bool> EnableR600IfConvert(
+ "r600-if-convert",
+ cl::desc("Use if conversion pass"),
+ cl::ReallyHidden,
+ cl::init(true));
+
+// Option to control the load/store vectorizer (off by default; tests enable it).
+static cl::opt<bool> EnableLoadStoreVectorizer(
+ "amdgpu-load-store-vectorizer",
+ cl::desc("Enable load store vectorizer"),
+ cl::init(false),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget);
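A usage note: the cl::opt flags above surface as llc command-line switches in a build containing this patch. All of them default to on except amdgpu-load-store-vectorizer, and the Hidden/ReallyHidden ones are omitted from -help output but can still be set explicitly. A minimal sketch (the input file name is hypothetical):

    llc -march=amdgcn -mcpu=tahiti -amdgpu-load-store-vectorizer=1 kernel.ll
    llc -march=r600 -mcpu=cypress -r600-if-convert=0 kernel.ll
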
@@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSILowerI1CopiesPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFoldOperandsPass(*PR);
- initializeSIFixSGPRLiveRangesPass(*PR);
+ initializeSIShrinkInstructionsPass(*PR);
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
+ initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUCodeGenPreparePass(*PR);
+ initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIDebuggerInsertNopsPass(*PR);
+ initializeSIInsertWaitsPass(*PR);
+ initializeSIWholeQuadModePass(*PR);
+ initializeSILowerControlFlowPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- if (TT.getOS() == Triple::AMDHSA)
- return make_unique<AMDGPUHSATargetObjectFile>();
-
return make_unique<AMDGPUTargetObjectFile>();
}
@@ -73,60 +107,156 @@ static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
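(Registering SISchedRegistry also makes this scheduler selectable by hand via llc -misched=si; the createMachineScheduler hook added further down picks it automatically whenever the subtarget reports enableSIScheduler().)
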
-static std::string computeDataLayout(const Triple &TT) {
- std::string Ret = "e-p:32:32";
-
- if (TT.getArch() == Triple::amdgcn) {
- // 32-bit private, local, and region pointers. 64-bit global and constant.
- Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
+static StringRef computeDataLayout(const Triple &TT) {
+ if (TT.getArch() == Triple::r600) {
+ // 32-bit pointers.
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}
- Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
- "-v512:512-v1024:1024-v2048:2048-n32:64";
+ // 32-bit private, local, and region pointers. 64-bit global, constant and
+ // flat.
+ return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+ "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+ "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+}
+
+LLVM_READNONE
+static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
+ if (!GPU.empty())
+ return GPU;
- return Ret;
+ // HSA only supports CI+, so change the default GPU to a CI for HSA.
+ if (TT.getArch() == Triple::amdgcn)
+ return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";
+
+ return "r600";
+}
+
+static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
+ // The AMDGPU toolchain only supports generating shared objects, so we
+ // must always use PIC.
+ return Reloc::PIC_;
}
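
To make the amdgcn data layout string above concrete, here is a minimal sketch, assuming only the string itself, of the per-address-space pointer widths it encodes:

    #include "llvm/IR/DataLayout.h"
    #include <cassert>
    using namespace llvm;

    // Sketch: query the pointer widths encoded by the amdgcn layout string.
    static void checkAMDGCNLayout() {
      DataLayout DL("e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
                    "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
                    "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64");
      assert(DL.getPointerSizeInBits(0) == 32); // default address space
      assert(DL.getPointerSizeInBits(1) == 64); // global
      assert(DL.getPointerSizeInBits(3) == 32); // local
      assert(DL.getPointerSizeInBits(4) == 64); // constant
    }
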
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
- TargetOptions Options, Reloc::Model RM,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM,
CodeGenOpt::Level OptLevel)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM,
- OptLevel),
- TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this),
- IntrinsicInfo() {
+ : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
+ FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
+ TLOF(createTLOF(getTargetTriple())),
+ IntrinsicInfo() {
setRequiresStructuredCFG(true);
initAsmInfo();
}
AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
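
These two helpers drive per-function subtarget selection: a frontend can override the module-level CPU and feature string by attaching attributes to individual functions. A hypothetical illustration (the function name and attribute values are made up, not from this patch):

    // The getSubtargetImpl() overrides below will pick these up.
    Function *F = M.getFunction("kernel");
    F->addFnAttr("target-cpu", "fiji");
    F->addFnAttr("target-features", "+fp64-denormals");
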
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const R600Subtarget *R600TargetMachine::getSubtargetImpl(
+ const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+struct SIGISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ const AMDGPUCallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+};
+} // End anonymous namespace.
+#endif
+
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
- StringRef FS, StringRef CPU,
- TargetOptions Options, Reloc::Model RM,
+ StringRef CPU, StringRef FS,
+ TargetOptions Options,
+ Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ return I.get();
+}
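
Both getSubtargetImpl() overrides memoize one subtarget per unique GPU-plus-feature-string key, so repeated lookups for the same function are cheap. Machine passes then reach the cached subtarget through their MachineFunction, as the scheduler hook below does; a minimal sketch (MyPass is hypothetical):

    bool MyPass::runOnMachineFunction(MachineFunction &MF) {
      // Returns the SISubtarget cached by GCNTargetMachine::getSubtargetImpl.
      const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
      bool UseSISched = ST.enableSIScheduler(); // example subtarget query
      (void)UseSISched;
      return false; // this sketch changes nothing
    }
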
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
namespace {
+
class AMDGPUPassConfig : public TargetPassConfig {
public:
AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
@@ -142,16 +272,8 @@ public:
return getTM<AMDGPUTargetMachine>();
}
- ScheduleDAGInstrs *
- createMachineScheduler(MachineSchedContext *C) const override {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
- return createR600MachineScheduler(C);
- else if (ST.enableSIScheduler())
- return createSIMachineScheduler(C);
- return nullptr;
- }
-
+ void addEarlyCSEOrGVNPass();
+ void addStraightLineScalarOptimizationPasses();
void addIRPasses() override;
void addCodeGenPrepare() override;
bool addPreISel() override;
@@ -159,27 +281,44 @@ public:
bool addGCPasses() override;
};
-class R600PassConfig : public AMDGPUPassConfig {
+class R600PassConfig final : public AMDGPUPassConfig {
public:
R600PassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+ ScheduleDAGInstrs *createMachineScheduler(
+ MachineSchedContext *C) const override {
+ return createR600MachineScheduler(C);
+ }
+
bool addPreISel() override;
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
-class GCNPassConfig : public AMDGPUPassConfig {
+class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) { }
+
+ GCNTargetMachine &getGCNTargetMachine() const {
+ return getTM<GCNTargetMachine>();
+ }
+
+ ScheduleDAGInstrs *
+ createMachineScheduler(MachineSchedContext *C) const override;
+
bool addPreISel() override;
+ void addMachineSSAOptimization() override;
bool addInstSelector() override;
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ bool addIRTranslator() override;
+ bool addRegBankSelect() override;
+#endif
void addFastRegAlloc(FunctionPass *RegAllocPass) override;
void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
void addPreRegAlloc() override;
- void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
};
@@ -188,12 +327,39 @@ public:
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(
- AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
});
}
+void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
+ if (getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createGVNPass());
+ else
+ addPass(createEarlyCSEPass());
+}
+
+void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ addPass(createSeparateConstOffsetFromGEPPass());
+ addPass(createSpeculativeExecutionPass());
+ // ReassociateGEPs exposes more opportunities for SLSR. See
+ // the example in reassociate-geps-and-slsr.ll.
+ addPass(createStraightLineStrengthReducePass());
+ // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
+ // EarlyCSE can reuse.
+ addEarlyCSEOrGVNPass();
+ // Run NaryReassociate after EarlyCSE/GVN to be more effective.
+ addPass(createNaryReassociatePass());
+ // NaryReassociate on GEPs creates redundant common expressions, so run
+ // EarlyCSE after it.
+ addPass(createEarlyCSEPass());
+}
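
For intuition about what StraightLineStrengthReduce contributes here, a hand-written before/after sketch (illustrative only, not taken from reassociate-geps-and-slsr.ll):

    // Before SLSR: two multiplies that share the stride s.
    int a = x * s;
    int b = (x + 1) * s;

    // After SLSR: the second value is rewritten in terms of the first,
    // trading the multiply for an add. GVN/EarlyCSE then reuse the result.
    int a = x * s;
    int b = a + s;
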
+
void AMDGPUPassConfig::addIRPasses() {
+ // There is no reason to run these.
+ disablePass(&StackMapLivenessID);
+ disablePass(&FuncletLayoutID);
+ disablePass(&PatchableFunctionID);
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerPass());
@@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() {
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
addPass(createAMDGPUOpenCLImageTypeLoweringPass());
+ const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ if (TM.getOptLevel() > CodeGenOpt::None) {
+ addPass(createAMDGPUPromoteAlloca(&TM));
+
+ if (EnableSROA)
+ addPass(createSROAPass());
+ }
+
+ addStraightLineScalarOptimizationPasses();
+
TargetPassConfig::addIRPasses();
+
+ // EarlyCSE is not always strong enough to clean up what LSR produces. For
+ // example, GVN can combine
+ //
+ // %0 = add %a, %b
+ // %1 = add %b, %a
+ //
+ // and
+ //
+ // %0 = shl nsw %a, 2
+ // %1 = shl %a, 2
+ //
+ // but EarlyCSE can do neither of them.
+ if (getOptLevel() != CodeGenOpt::None)
+ addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
- if (ST.isPromoteAllocaEnabled()) {
- addPass(createAMDGPUPromoteAlloca(ST));
- addPass(createSROAPass());
- }
TargetPassConfig::addCodeGenPrepare();
+
+ if (EnableLoadStoreVectorizer)
+ addPass(createLoadStoreVectorizerPass());
}
-bool
-AMDGPUPassConfig::addPreISel() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+bool AMDGPUPassConfig::addPreISel() {
addPass(createFlattenCFGPass());
- if (ST.IsIRStructurizerEnabled())
- addPass(createStructurizeCFGPass());
return false;
}
@@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() {
bool R600PassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- addPass(createR600TextureIntrinsicsReplacer());
+
+ if (EnableR600StructurizeCFG)
+ addPass(createStructurizeCFGPass());
return false;
}
@@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() {
}
void R600PassConfig::addPreSched2() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
addPass(createR600EmitClauseMarkers(), false);
- if (ST.isIfCvtEnabled())
+ if (EnableR600IfConvert)
addPass(&IfConverterID, false);
addPass(createR600ClauseMergePass(*TM), false);
}
@@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
// GCN Pass Setup
//===----------------------------------------------------------------------===//
+ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
+ MachineSchedContext *C) const {
+ const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
+ if (ST.enableSIScheduler())
+ return createSIMachineScheduler(C);
+ return nullptr;
+}
+
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.
addPass(&AMDGPUAnnotateKernelFeaturesID);
-
+ addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
- addPass(createSIAnnotateControlFlowPass());
addPass(createAMDGPUAnnotateUniformValues());
+ addPass(createSIAnnotateControlFlowPass());
return false;
}
+void GCNPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+
+ // We want to fold operands after PeepholeOptimizer has run (or as part of
+ // it), because it will eliminate extra copies making it easier to fold the
+ // real source operand. We want to eliminate dead instructions after, so that
+ // we see fewer uses of the copies. We then need to clean up the dead
+ // instructions leftover after the operands are folded as well.
+ //
+ // XXX - Can we get away without running DeadMachineInstructionElim again?
+ addPass(&SIFoldOperandsID);
+ addPass(&DeadMachineInstructionElimID);
+}
+
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);
- addPass(createSIFoldOperandsPass());
return false;
}
-void GCNPassConfig::addPreRegAlloc() {
- const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+bool GCNPassConfig::addIRTranslator() {
+ addPass(new IRTranslator());
+ return false;
+}
+bool GCNPassConfig::addRegBankSelect() {
+ return false;
+}
+#endif
+
+void GCNPassConfig::addPreRegAlloc() {
// This needs to be run directly before register allocation because
// earlier passes might recompute live intervals.
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
@@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() {
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
}
- if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) {
+ if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
// This should be run after scheduling, but before register allocation. It
// also needs extra copies to the address operand to be eliminated.
+
+ // FIXME: Move pre-RA and remove extra reg coalescer run.
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
- addPass(createSIShrinkInstructionsPass(), false);
+
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
- addPass(&SIFixSGPRLiveRangesID);
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
- // We want to run this after LiveVariables is computed to avoid computing them
- // twice.
- // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure
- // that needs to be fixed.
- insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false);
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
-void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIShrinkInstructionsPass(), false);
-}
-
void GCNPassConfig::addPreSched2() {
}
void GCNPassConfig::addPreEmitPass() {
- addPass(createSIInsertWaits(*TM), false);
- addPass(createSILowerControlFlowPass(*TM), false);
+ // The hazard recognizer that runs as part of the post-ra scheduler does not
+ // guarantee to be able to handle all hazards correctly. This is because if there
+ // are multiple scheduling regions in a basic block, the regions are scheduled
+ // bottom up, so when we begin to schedule a region we don't know what
+ // instructions were emitted directly before it.
+ //
+ // Here we add a stand-alone hazard recognizer pass which can handle all
+ // cases.
+ addPass(&PostRAHazardRecognizerID);
+
+ addPass(createSIInsertWaitsPass());
+ addPass(createSIShrinkInstructionsPass());
+ addPass(createSILowerControlFlowPass());
+ addPass(createSIDebuggerInsertNopsPass());
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {