diff options
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUTargetMachine.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 370 |
1 files changed, 296 insertions, 74 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 519ae5cc748d..3e53f52c689f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,19 +14,23 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" -#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" +#include "AMDGPUCallLowering.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" + #include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/LegacyPassManager.h" @@ -34,10 +38,35 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include <llvm/CodeGen/Passes.h> +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" using namespace llvm; +static cl::opt<bool> EnableR600StructurizeCFG( + "r600-ir-structurize", + cl::desc("Use StructurizeCFG IR pass"), + cl::init(true)); + +static cl::opt<bool> EnableSROA( + "amdgpu-sroa", + cl::desc("Run SROA after promote alloca pass"), + cl::ReallyHidden, + cl::init(true)); + +static cl::opt<bool> EnableR600IfConvert( + "r600-if-convert", + cl::desc("Use if conversion pass"), + cl::ReallyHidden, + cl::init(true)); + +// Option to disable vectorizer for tests. 
+static cl::opt<bool> EnableLoadStoreVectorizer( + "amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); @@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); - initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPUPromoteAllocaPass(*PR); + initializeAMDGPUCodeGenPreparePass(*PR); + initializeSIAnnotateControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); + initializeSIInsertWaitsPass(*PR); + initializeSIWholeQuadModePass(*PR); + initializeSILowerControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - if (TT.getOS() == Triple::AMDHSA) - return make_unique<AMDGPUHSATargetObjectFile>(); - return make_unique<AMDGPUTargetObjectFile>(); } @@ -73,60 +107,156 @@ static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler); -static std::string computeDataLayout(const Triple &TT) { - std::string Ret = "e-p:32:32"; - - if (TT.getArch() == Triple::amdgcn) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; +static StringRef computeDataLayout(const Triple &TT) { + if (TT.getArch() == Triple::r600) { + // 32-bit pointers. 
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; + // 32-bit private, local, and region pointers. 64-bit global, constant and + // flat. + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; +} + +LLVM_READNONE +static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { + if (!GPU.empty()) + return GPU; - return Ret; + // HSA only supports CI+, so change the default GPU to a CI for HSA. + if (TT.getArch() == Triple::amdgcn) + return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; + + return "r600"; +} + +static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { + // The AMDGPU toolchain only supports generating shared objects, so we + // must always use PIC. + return Reloc::PIC_; } AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), + FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), + TLOF(createTLOF(getTargetTriple())), + IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() { } +StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { + Attribute GPUAttr = F.getFnAttribute("target-cpu"); + return GPUAttr.hasAttribute(Attribute::None) ? 
+ getTargetCPU() : GPUAttr.getValueAsString(); +} + +StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { + Attribute FSAttr = F.getFnAttribute("target-features"); + + return FSAttr.hasAttribute(Attribute::None) ? + getTargetFeatureString() : + FSAttr.getValueAsString(); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, + StringRef CPU, StringRef FS, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + +const R600Subtarget *R600TargetMachine::getSubtargetImpl( + const Function &F) const { + StringRef GPU = getGPUName(F); + StringRef FS = getFeatureString(F); + + SmallString<128> SubtargetKey(GPU); + SubtargetKey.append(FS); + + auto &I = SubtargetMap[SubtargetKey]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); + } + + return I.get(); +} //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct SIGISelActualAccessor : public GISelAccessor { + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + const AMDGPUCallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } +}; +} // End anonymous namespace. 
+#endif + GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, + StringRef CPU, StringRef FS, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + +const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { + StringRef GPU = getGPUName(F); + StringRef FS = getFeatureString(F); + + SmallString<128> SubtargetKey(GPU); + SubtargetKey.append(FS); + + auto &I = SubtargetMap[SubtargetKey]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); + +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); + GISel->CallLoweringInfo.reset( + new AMDGPUCallLowering(*I->getTargetLowering())); +#endif + + I->setGISelAccessor(*GISel); + } + + return I.get(); +} //===----------------------------------------------------------------------===// // AMDGPU Pass Setup //===----------------------------------------------------------------------===// namespace { + class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) @@ -142,16 +272,8 @@ public: return getTM<AMDGPUTargetMachine>(); } - ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - return createR600MachineScheduler(C); - else if (ST.enableSIScheduler()) - return createSIMachineScheduler(C); - return 
nullptr; - } - + void addEarlyCSEOrGVNPass(); + void addStraightLineScalarOptimizationPasses(); void addIRPasses() override; void addCodeGenPrepare() override; bool addPreISel() override; @@ -159,27 +281,44 @@ public: bool addGCPasses() override; }; -class R600PassConfig : public AMDGPUPassConfig { +class R600PassConfig final : public AMDGPUPassConfig { public: R600PassConfig(TargetMachine *TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { } + ScheduleDAGInstrs *createMachineScheduler( + MachineSchedContext *C) const override { + return createR600MachineScheduler(C); + } + bool addPreISel() override; void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; -class GCNPassConfig : public AMDGPUPassConfig { +class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { } + + GCNTargetMachine &getGCNTargetMachine() const { + return getTM<GCNTargetMachine>(); + } + + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override; + bool addPreISel() override; + void addMachineSSAOptimization() override; bool addInstSelector() override; +#ifdef LLVM_BUILD_GLOBAL_ISEL + bool addIRTranslator() override; + bool addRegBankSelect() override; +#endif void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; - void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; @@ -188,12 +327,39 @@ public: TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo( - AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); + return TargetTransformInfo(AMDGPUTTIImpl(this, F)); }); } +void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + 
addPass(createEarlyCSEPass()); +} + +void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + addPass(createSeparateConstOffsetFromGEPPass()); + addPass(createSpeculativeExecutionPass()); + // ReassociateGEPs exposes more opportunities for SLSR. See + // the example in reassociate-geps-and-slsr.ll. + addPass(createStraightLineStrengthReducePass()); + // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or + // EarlyCSE can reuse. + addEarlyCSEOrGVNPass(); + // Run NaryReassociate after EarlyCSE/GVN to be more effective. + addPass(createNaryReassociatePass()); + // NaryReassociate on GEPs creates redundant common expressions, so run + // EarlyCSE after it. + addPass(createEarlyCSEPass()); +} + void AMDGPUPassConfig::addIRPasses() { + // There is no reason to run these. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + disablePass(&PatchableFunctionID); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerPass()); @@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() { // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + if (TM.getOptLevel() > CodeGenOpt::None) { + addPass(createAMDGPUPromoteAlloca(&TM)); + + if (EnableSROA) + addPass(createSROAPass()); + } + + addStraightLineScalarOptimizationPasses(); + TargetPassConfig::addIRPasses(); + + // EarlyCSE is not always strong enough to clean up what LSR produces. For + // example, GVN can combine + // + // %0 = add %a, %b + // %1 = add %b, %a + // + // and + // + // %0 = shl nsw %a, 2 + // %1 = shl %a, 2 + // + // but EarlyCSE can do neither of them. 
+ if (getOptLevel() != CodeGenOpt::None) + addEarlyCSEOrGVNPass(); } void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.isPromoteAllocaEnabled()) { - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); - } TargetPassConfig::addCodeGenPrepare(); + + if (EnableLoadStoreVectorizer) + addPass(createLoadStoreVectorizerPass()); } -bool -AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); +bool AMDGPUPassConfig::addPreISel() { addPass(createFlattenCFGPass()); - if (ST.IsIRStructurizerEnabled()) - addPass(createStructurizeCFGPass()); return false; } @@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() { bool R600PassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - addPass(createR600TextureIntrinsicsReplacer()); + + if (EnableR600StructurizeCFG) + addPass(createStructurizeCFGPass()); return false; } @@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() { } void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) + if (EnableR600IfConvert) addPass(&IfConverterID, false); addPass(createR600ClauseMergePass(*TM), false); } @@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { // GCN Pass Setup //===----------------------------------------------------------------------===// +ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( + MachineSchedContext *C) const { + const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + if (ST.enableSIScheduler()) + return createSIMachineScheduler(C); + return nullptr; +} + bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); // FIXME: We need to run a pass to propagate the attributes when calls are // supported. 
addPass(&AMDGPUAnnotateKernelFeaturesID); - + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); addPass(createAMDGPUAnnotateUniformValues()); + addPass(createSIAnnotateControlFlowPass()); return false; } +void GCNPassConfig::addMachineSSAOptimization() { + TargetPassConfig::addMachineSSAOptimization(); + + // We want to fold operands after PeepholeOptimizer has run (or as part of + // it), because it will eliminate extra copies making it easier to fold the + // real source operand. We want to eliminate dead instructions after, so that + // we see fewer uses of the copies. We then need to clean up the dead + // instructions leftover after the operands are folded as well. + // + // XXX - Can we get away without running DeadMachineInstructionElim again? + addPass(&SIFoldOperandsID); + addPass(&DeadMachineInstructionElimID); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); addPass(&SIFixSGPRCopiesID); - addPass(createSIFoldOperandsPass()); return false; } -void GCNPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); +#ifdef LLVM_BUILD_GLOBAL_ISEL +bool GCNPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} +bool GCNPassConfig::addRegBankSelect() { + return false; +} +#endif + +void GCNPassConfig::addPreRegAlloc() { // This needs to be run directly before register allocation because // earlier passes might recompute live intervals. 
// TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass @@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() { insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + if (getOptLevel() > CodeGenOpt::None) { // Don't do this with no optimizations since it throws away debug info by // merging nonadjacent loads. // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. + + // FIXME: Move pre-RA and remove extra reg coalescer run. insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } - addPass(createSIShrinkInstructionsPass(), false); + + addPass(createSIShrinkInstructionsPass()); + addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - addPass(&SIFixSGPRLiveRangesID); TargetPassConfig::addFastRegAlloc(RegAllocPass); } void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - // We want to run this after LiveVariables is computed to avoid computing them - // twice. - // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure - // that needs to be fixed. - insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } -void GCNPassConfig::addPostRegAlloc() { - addPass(createSIShrinkInstructionsPass(), false); -} - void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { - addPass(createSIInsertWaits(*TM), false); - addPass(createSILowerControlFlowPass(*TM), false); + // The hazard recognizer that runs as part of the post-ra scheduler does not + // guarantee to be able to handle all hazards correctly. 
This is because if there + // are multiple scheduling regions in a basic block, the regions are scheduled + // bottom up, so when we begin to schedule a region we don't know what + // instructions were emitted directly before it. + // + // Here we add a stand-alone hazard recognizer pass which can handle all + // cases. + addPass(&PostRAHazardRecognizerID); + + addPass(createSIInsertWaitsPass()); + addPass(createSIShrinkInstructionsPass()); + addPass(createSILowerControlFlowPass()); + addPass(createSIDebuggerInsertNopsPass()); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { |