| field | value | date |
| --- | --- | --- |
| author | Dimitry Andric <dim@FreeBSD.org> | 2016-07-23 20:41:05 +0000 |
| committer | Dimitry Andric <dim@FreeBSD.org> | 2016-07-23 20:41:05 +0000 |
| commit | 01095a5d43bbfde13731688ddcf6048ebb8b7721 (patch) | |
| tree | 4def12e759965de927d963ac65840d663ef9d1ea /lib/Target/AMDGPU | |
| parent | f0f4822ed4b66e3579e92a89f368f8fb860e218e (diff) | |
| download | src-vendor/llvm/llvm-release_39-r276489.tar.gz, src-vendor/llvm/llvm-release_39-r276489.zip | |
Vendor import of llvm release_39 branch r276489 (vendor/llvm/llvm-release_39-r276489)
Diffstat (limited to 'lib/Target/AMDGPU')
145 files changed, 16770 insertions, 9267 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 4f718e1ca310..7e59710a427a 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPU_H -#define LLVM_LIB_TARGET_R600_AMDGPU_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetMachine.h" @@ -29,7 +29,6 @@ class TargetMachine; // R600 Passes FunctionPass *createR600VectorRegMerger(TargetMachine &tm); -FunctionPass *createR600TextureIntrinsicsReplacer(); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); FunctionPass *createR600EmitClauseMarkers(); FunctionPass *createR600ClauseMergePass(TargetMachine &tm); @@ -44,12 +43,14 @@ FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); -FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSIWholeQuadModePass(); +FunctionPass *createSILowerControlFlowPass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); -FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); -FunctionPass *createSIInsertWaits(TargetMachine &tm); +FunctionPass *createSIDebuggerInsertNopsPass(); +FunctionPass *createSIInsertWaitsPass(); +FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr); ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C); @@ -60,6 +61,9 @@ extern char &AMDGPUAnnotateKernelFeaturesID; void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIShrinkInstructionsPass(PassRegistry&); +extern char &SIShrinkInstructionsID; + void initializeSIFixSGPRCopiesPass(PassRegistry &); extern char &SIFixSGPRCopiesID; @@ -69,8 +73,19 @@ extern char &SILowerI1CopiesID; void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; +void initializeSIWholeQuadModePass(PassRegistry &); +extern char &SIWholeQuadModeID; + +void initializeSILowerControlFlowPass(PassRegistry &); +extern char &SILowerControlFlowPassID; + + // Passes common to R600 and SI -FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); +FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr); +void initializeAMDGPUPromoteAllocaPass(PassRegistry&); +extern char &AMDGPUPromoteAllocaID; + +FunctionPass *createAMDGPUAddDivergenceMetadata(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); @@ -80,12 +95,21 @@ FunctionPass *createAMDGPUAnnotateUniformValues(); void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; -void initializeSIFixSGPRLiveRangesPass(PassRegistry&); -extern char &SIFixSGPRLiveRangesID; - void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); extern char &AMDGPUAnnotateUniformValuesPassID; +void initializeAMDGPUCodeGenPreparePass(PassRegistry&); +extern char &AMDGPUCodeGenPrepareID; + +void initializeSIAnnotateControlFlowPass(PassRegistry&); +extern char &SIAnnotateControlFlowPassID; + +void initializeSIDebuggerInsertNopsPass(PassRegistry&); +extern char 
&SIDebuggerInsertNopsID; + +void initializeSIInsertWaitsPass(PassRegistry&); +extern char &SIInsertWaitsID; + extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -101,15 +125,6 @@ enum TargetIndex { } // End namespace llvm -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - /// OpenCL uses address spaces to differentiate between /// various memory regions on the hardware. On the CPU /// all of the address spaces point to the same memory, @@ -120,7 +135,7 @@ namespace AMDGPUAS { enum AddressSpaces : unsigned { PRIVATE_ADDRESS = 0, ///< Address space for private memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - CONSTANT_ADDRESS = 2, ///< Address space for constant memory + CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. FLAT_ADDRESS = 4, ///< Address space for flat memory. REGION_ADDRESS = 5, ///< Address space for region memory. @@ -148,8 +163,6 @@ enum AddressSpaces : unsigned { CONSTANT_BUFFER_13 = 21, CONSTANT_BUFFER_14 = 22, CONSTANT_BUFFER_15 = 23, - ADDRESS_NONE = 24, ///< Address space for unknown memory. - LAST_ADDRESS = ADDRESS_NONE, // Some places use this if the address space can't be determined. UNKNOWN_ADDRESS_SPACE = ~0u diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 844d89c737bf..72c455354411 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -1,182 +1,121 @@ -//===-- AMDGPU.td - AMDGPU Tablegen files ------------------*- tablegen -*-===// +//===-- AMDGPU.td - AMDGPU Tablegen files --------*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// -//===----------------------------------------------------------------------===// +//===------------------------------------------------------------===// include "llvm/Target/Target.td" -//===----------------------------------------------------------------------===// -// Subtarget Features -//===----------------------------------------------------------------------===// - -// Debugging Features - -def FeatureDumpCode : SubtargetFeature <"DumpCode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", - "DumpCode", - "true", - "Dump MachineInstrs in the CodeEmitter">; - -def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer", - "EnableIRStructurizer", - "false", - "Disable IR Structurizer">; - -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass">; - -// Target features - -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", - "EnableIfCvt", - "false", - "Disable the if conversion pass">; +//===------------------------------------------------------------===// +// Subtarget Features (device properties) +//===------------------------------------------------------------===// def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations">; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64Denormals", - "true", - "Enable double precision denormal handling", - [FeatureFP64]>; + "FP64", + "true", + "Enable double precision operations" +>; def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", - "FastFMAF32", - "true", - "Assuming f32 fma is at least as fast as mul + add", - []>; - -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling">; + "FastFMAF32", + "true", + "Assuming f32 fma is at least as fast as mul + add" +>; -def Feature64BitPtr : SubtargetFeature<"64BitPtr", - "Is64bit", - "true", - "Specify if 64-bit addressing should be used">; +def HalfRate64Ops : SubtargetFeature<"half-rate-64-ops", + "HalfRate64Ops", + "true", + "Most fp64 instructions are half rate instead of quarter" +>; def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding">; + "R600ALUInst", + "false", + "Older version of ALU instructions encoding" +>; def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", - "true", - "Specify use of dedicated vertex cache">; + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache" +>; def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA">; + "CaymanISA", + "true", + "Use Cayman ISA" +>; def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", - "true", - "GPU has CF_ALU bug">; - -// XXX - This should probably be removed once enabled by default -def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", - "EnableLoadStoreOpt", - "true", - "Enable SI load/store optimizer pass">; - -// Performance debugging feature. Allow using DS instruction immediate -// offsets even if the base pointer can't be proven to be base. 
On SI, -// base pointer values that won't give the same result as a 16-bit add -// are not safe to fold, but this will override the conservative test -// for the base pointer. -def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-folding", - "EnableUnsafeDSOffsetFolding", - "true", - "Force using DS instruction immediate offsets on SI">; - -def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", - "FlatForGlobal", - "true", - "Force to generate flat instruction for global">; + "CFALUBug", + "true", + "GPU has CF_ALU bug" +>; def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", - "FlatAddressSpace", - "true", - "Support flat address space">; + "FlatAddressSpace", + "true", + "Support flat address space" +>; -def FeatureXNACK : SubtargetFeature<"xnack", - "EnableXNACK", - "true", - "Enable XNACK support">; +def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", + "UnalignedBufferAccess", + "true", + "Support unaligned global loads and stores" +>; -def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", - "EnableVGPRSpilling", - "true", - "Enable spilling of VGPRs to scratch memory">; +def FeatureXNACK : SubtargetFeature<"xnack", + "EnableXNACK", + "true", + "Enable XNACK support" +>; def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", - "SGPRInitBug", - "true", - "VI SGPR initilization bug requiring a fixed SGPR allocation size">; - -def FeatureEnableHugeScratchBuffer : SubtargetFeature<"huge-scratch-buffer", - "EnableHugeScratchBuffer", - "true", - "Enable scratch buffer sizes greater than 128 GB">; - -def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", - "EnableSIScheduler", - "true", - "Enable SI Machine Scheduler">; + "SGPRInitBug", + "true", + "VI SGPR initilization bug requiring a fixed SGPR allocation size" +>; class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value>; + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value +>; def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast<string>(Value), - "The number of threads per wavefront">; + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront" +>; def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature < - "ldsbankcount"#Value, - "LDSBankCount", - !cast<string>(Value), - "The number of LDS banks per compute unit.">; + "ldsbankcount"#Value, + "LDSBankCount", + !cast<string>(Value), + "The number of LDS banks per compute unit." 
+>; def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping> : SubtargetFeature < - "isaver"#Major#"."#Minor#"."#Stepping, - "IsaVersion", - "ISAVersion"#Major#"_"#Minor#"_"#Stepping, - "Instruction set version number" + "isaver"#Major#"."#Minor#"."#Stepping, + "IsaVersion", + "ISAVersion"#Major#"_"#Minor#"_"#Stepping, + "Instruction set version number" >; def FeatureISAVersion7_0_0 : SubtargetFeatureISAVersion <7,0,0>; @@ -186,36 +125,145 @@ def FeatureISAVersion8_0_1 : SubtargetFeatureISAVersion <8,0,1>; def FeatureISAVersion8_0_3 : SubtargetFeatureISAVersion <8,0,3>; class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast<string>(Value), - "The size of local memory in bytes">; + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes" +>; def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU">; + "IsGCN", + "true", + "GCN or newer GPU" +>; def FeatureGCN1Encoding : SubtargetFeature<"gcn1-encoding", - "GCN1Encoding", - "true", - "Encoding format for SI and CI">; + "GCN1Encoding", + "true", + "Encoding format for SI and CI" +>; def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", - "GCN3Encoding", - "true", - "Encoding format for VI">; + "GCN3Encoding", + "true", + "Encoding format for VI" +>; def FeatureCIInsts : SubtargetFeature<"ci-insts", - "CIInsts", - "true", - "Additional intstructions for CI+">; + "CIInsts", + "true", + "Additional intstructions for CI+" +>; + +def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime", + "HasSMemRealTime", + "true", + "Has s_memrealtime instruction" +>; + +def Feature16BitInsts : SubtargetFeature<"16-bit-insts", + "Has16BitInsts", + "true", + "Has i16/f16 instructions" +>; + +//===------------------------------------------------------------===// +// Subtarget Features (options and debugging) +//===------------------------------------------------------------===// + +// Some instructions do not support denormals despite this flag. Using +// fp32 denormals also causes instructions to run at the double +// precision rate for the device. 
+def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", + "FP32Denormals", + "true", + "Enable single precision denormal handling" +>; + +def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", + "FP64Denormals", + "true", + "Enable double precision denormal handling", + [FeatureFP64] +>; + +def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", + "FPExceptions", + "true", + "Enable floating point exceptions" +>; + +class FeatureMaxPrivateElementSize<int size> : SubtargetFeature< + "max-private-element-size-"#size, + "MaxPrivateElementSize", + !cast<string>(size), + "Maximum private access size may be "#size +>; + +def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; +def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; +def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; + +def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", + "EnableVGPRSpilling", + "true", + "Enable spilling of VGPRs to scratch memory" +>; + +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter" +>; + +def FeatureDumpCodeLower : SubtargetFeature <"dumpcode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter" +>; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass" +>; + +// XXX - This should probably be removed once enabled by default +def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", + "EnableLoadStoreOpt", + "true", + "Enable SI load/store optimizer pass" +>; + +// Performance debugging feature. Allow using DS instruction immediate +// offsets even if the base pointer can't be proven to be base. On SI, +// base pointer values that won't give the same result as a 16-bit add +// are not safe to fold, but this will override the conservative test +// for the base pointer. +def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature < + "unsafe-ds-offset-folding", + "EnableUnsafeDSOffsetFolding", + "true", + "Force using DS instruction immediate offsets on SI" +>; + +def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", + "EnableSIScheduler", + "true", + "Enable SI Machine Scheduler" +>; + +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global" +>; // Dummy feature used to disable assembler instructions. 
def FeatureDisable : SubtargetFeature<"", - "FeatureDisable","true", - "Dummy feature to disable assembler" - " instructions">; + "FeatureDisable","true", + "Dummy feature to disable assembler instructions" +>; class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> : @@ -227,33 +275,66 @@ def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] +>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0] +>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768] +>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] >; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768, - FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, - FeatureLDSBankCount32]>; + [FeatureFP64, FeatureLocalMemorySize32768, + FeatureWavefrontSize64, FeatureGCN, FeatureGCN1Encoding, + FeatureLDSBankCount32] +>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, - FeatureGCN1Encoding, FeatureCIInsts]>; + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, + FeatureGCN1Encoding, FeatureCIInsts] +>; def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", - [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536, - FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, - FeatureGCN3Encoding, FeatureCIInsts]>; + [FeatureFP64, FeatureLocalMemorySize65536, + FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, + FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime + ] +>; + +//===----------------------------------------------------------------------===// +// Debugger related subtarget features. 
+//===----------------------------------------------------------------------===// + +def FeatureDebuggerInsertNops : SubtargetFeature< + "amdgpu-debugger-insert-nops", + "DebuggerInsertNops", + "true", + "Insert one nop instruction for each high level source statement" +>; + +def FeatureDebuggerReserveRegs : SubtargetFeature< + "amdgpu-debugger-reserve-regs", + "DebuggerReserveRegs", + "true", + "Reserve registers for debugger usage" +>; + +def FeatureDebuggerEmitPrologue : SubtargetFeature< + "amdgpu-debugger-emit-prologue", + "DebuggerEmitPrologue", + "true", + "Emit debugger prologue" +>; //===----------------------------------------------------------------------===// @@ -283,6 +364,7 @@ def NullALU : InstrItinClass; //===----------------------------------------------------------------------===// def TruePredicate : Predicate<"true">; + def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" @@ -292,6 +374,13 @@ def isVI : Predicate < "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, AssemblerPredicate<"FeatureGCN3Encoding">; +def isCIVI : Predicate < + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" +>, AssemblerPredicate<"FeatureCIInsts">; + +def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp index ad267d350850..63f5fb3cdf00 100644 --- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -45,9 +45,8 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) { for (Function *F : FuncsToClone) { ValueToValueMapTy VMap; - Function *NewFunc = CloneFunction(F, VMap, false); + Function *NewFunc = CloneFunction(F, VMap); NewFunc->setLinkage(GlobalValue::InternalLinkage); - M.getFunctionList().push_back(NewFunc); F->replaceAllUsesWith(NewFunc); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 378183927242..0910b2877b09 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -24,6 +25,8 @@ namespace { class AMDGPUAnnotateKernelFeatures : public ModulePass { private: + static bool hasAddrSpaceCast(const Function &F); + void addAttrToCallers(Function *Intrin, StringRef AttrName); bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); @@ -40,6 +43,11 @@ public: AU.setPreservesAll(); ModulePass::getAnalysisUsage(AU); } + + static bool visitConstantExpr(const ConstantExpr *CE); + static bool visitConstantExprsRecursively( + const Constant *EntryC, + SmallPtrSet<const Constant *, 8> &ConstantExprVisited); }; } @@ -48,12 +56,87 @@ char AMDGPUAnnotateKernelFeatures::ID = 0; char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; +INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +// The queue ptr is only needed when casting to flat, not from it. 
+static bool castRequiresQueuePtr(unsigned SrcAS) { + return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS; +} + +static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) { + return castRequiresQueuePtr(ASC->getSrcAddressSpace()); +} + +bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) { + if (CE->getOpcode() == Instruction::AddrSpaceCast) { + unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace(); + return castRequiresQueuePtr(SrcAS); + } + + return false; +} + +bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( + const Constant *EntryC, + SmallPtrSet<const Constant *, 8> &ConstantExprVisited) { -INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, - "Add AMDGPU function attributes", false, false) -INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, - "Add AMDGPU function attributes", false, false) + if (!ConstantExprVisited.insert(EntryC).second) + return false; + SmallVector<const Constant *, 16> Stack; + Stack.push_back(EntryC); + + while (!Stack.empty()) { + const Constant *C = Stack.pop_back_val(); + + // Check this constant expression. + if (const auto *CE = dyn_cast<ConstantExpr>(C)) { + if (visitConstantExpr(CE)) + return true; + } + + // Visit all sub-expressions. + for (const Use &U : C->operands()) { + const auto *OpC = dyn_cast<Constant>(U); + if (!OpC) + continue; + + if (!ConstantExprVisited.insert(OpC).second) + continue; + + Stack.push_back(OpC); + } + } + + return false; +} + +// Return true if an addrspacecast is used that requires the queue ptr. +bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) { + SmallPtrSet<const Constant *, 8> ConstantExprVisited; + + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { + if (castRequiresQueuePtr(ASC)) + return true; + } + + for (const Use &U : I.operands()) { + const auto *OpC = dyn_cast<Constant>(U); + if (!OpC) + continue; + + if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) + return true; + } + } + } + + return false; +} void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, StringRef AttrName) { @@ -89,35 +172,46 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { static const StringRef IntrinsicToAttr[][2] = { // .x omitted + { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, + { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, + + { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, + { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, + { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, // .x omitted { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" }, - { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" }, - { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" }, - - { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" }, - { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" }, - { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" } + { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, + { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" } }; + // TODO: We should not add the attributes if the known compile time workgroup + // size is 1 for y/z. 
+ // TODO: Intrinsics that require queue ptr. // We do not need to note the x workitem or workgroup id because they are // always initialized. bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA) + if (TT.getOS() == Triple::AMDHSA) { Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); + for (Function &F : M) { + if (F.hasFnAttribute("amdgpu-queue-ptr")) + continue; + + if (hasAddrSpaceCast(F)) + F.addFnAttr("amdgpu-queue-ptr"); + } + } + return Changed; } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index dfddc345f286..2010cc952265 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -43,6 +43,7 @@ public: AU.setPreservesAll(); } + void visitBranchInst(BranchInst &I); void visitLoadInst(LoadInst &I); }; @@ -57,13 +58,28 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, char AMDGPUAnnotateUniformValues::ID = 0; +static void setUniformMetadata(Instruction *I) { + I->setMetadata("amdgpu.uniform", MDNode::get(I->getContext(), {})); +} + +void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { + if (I.isUnconditional()) + return; + + Value *Cond = I.getCondition(); + if (!DA->isUniform(Cond)) + return; + + setUniformMetadata(I.getParent()->getTerminator()); +} + void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) - PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + setUniformMetadata(PtrI); } @@ -72,6 +88,9 @@ bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { } bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + DA = &getAnalysis<DivergenceAnalysis>(); visit(F); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 1239dfb235ef..cfe6346fb6b1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -28,8 +28,10 @@ #include "R600RegisterInfo.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" +#include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" @@ -37,7 +39,9 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" +#include "AMDGPURuntimeMetadata.h" +using namespace ::AMDGPU; using namespace llvm; // TODO: This should get the default rounding mode from the kernel. We just set @@ -61,7 +65,7 @@ using namespace llvm; // instructions to run at the double precision rate for the device so it's // probably best to just report no single precision denormals. static uint32_t getFPMode(const MachineFunction &F) { - const AMDGPUSubtarget& ST = F.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget& ST = F.getSubtarget<SISubtarget>(); // TODO: Is there any real use for the flush in only / flush out only modes? 
uint32_t FP32Denormals = @@ -104,10 +108,12 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) { AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - TS->EmitDirectiveHSACodeObjectVersion(1, 0); + TS->EmitDirectiveHSACodeObjectVersion(2, 1); + AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits()); TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU"); + emitStartOfRuntimeMetadata(M); } void AMDGPUAsmPrinter::EmitFunctionBodyStart() { @@ -132,54 +138,13 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { AsmPrinter::EmitFunctionEntryLabel(); } -static bool isModuleLinkage(const GlobalValue *GV) { - switch (GV->getLinkage()) { - case GlobalValue::InternalLinkage: - case GlobalValue::CommonLinkage: - return true; - case GlobalValue::ExternalLinkage: - return false; - default: llvm_unreachable("unknown linkage type"); - } -} - void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { - if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { - AsmPrinter::EmitGlobalVariable(GV); - return; - } - - if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { - AsmPrinter::EmitGlobalVariable(GV); - return; - } - // Group segment variables aren't emitted in HSA. if (AMDGPU::isGroupSegment(GV)) return; - AMDGPUTargetStreamer *TS = - static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); - if (isModuleLinkage(GV)) { - TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); - } else { - TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); - } - - MCSymbolELF *GVSym = cast<MCSymbolELF>(getSymbol(GV)); - const DataLayout &DL = getDataLayout(); - - // Emit the size - uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); - OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext)); - OutStreamer->PushSection(); - OutStreamer->SwitchSection( - getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); - const Constant *C = GV->getInitializer(); - OutStreamer->EmitLabel(GVSym); - EmitGlobalConstant(DL, C); - OutStreamer->PopSection(); + AsmPrinter::EmitGlobalVariable(GV); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -230,6 +195,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), false); + OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); + + OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), + false); + OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), + false); + + if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { + OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); + OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + + Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); + } OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), @@ -268,15 +247,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { } } + emitRuntimeMetadata(*MF.getFunction()); + return false; } void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { unsigned MaxGPR = 0; bool killPixel = false; - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); - const 
R600RegisterInfo *RI = - static_cast<const R600RegisterInfo *>(STM.getRegisterInfo()); + const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>(); + const R600RegisterInfo *RI = STM.getRegisterInfo(); const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); for (const MachineBasicBlock &MBB : MF) { @@ -299,23 +279,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { } unsigned RsrcReg; - if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { + if (STM.getGeneration() >= R600Subtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; } } else { // R600 / R700 - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_GS: // Fall through + case CallingConv::AMDGPU_CS: // Fall through + case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; } } @@ -325,23 +305,23 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) { OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); - OutStreamer->EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4); + OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4); } } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, const MachineFunction &MF) const { - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); uint64_t CodeSize = 0; unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; bool FlatUsed = false; - const SIRegisterInfo *RI = - static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); + const SIRegisterInfo *RI = STM.getRegisterInfo(); + const SIInstrInfo *TII = STM.getInstrInfo(); for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { @@ -351,8 +331,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (MI.isDebugValue()) continue; - // FIXME: This is reporting 0 for many instructions. 
- CodeSize += MI.getDesc().Size; + CodeSize += TII->getInstSizeInBytes(MI); unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { @@ -366,6 +345,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned reg = MO.getReg(); switch (reg) { case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: case AMDGPU::SCC: case AMDGPU::M0: continue; @@ -382,17 +363,32 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, FlatUsed = true; continue; + case AMDGPU::TBA: + case AMDGPU::TBA_LO: + case AMDGPU::TBA_HI: + case AMDGPU::TMA: + case AMDGPU::TMA_LO: + case AMDGPU::TMA_HI: + llvm_unreachable("Trap Handler registers should not be used"); + continue; + default: break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { + if (AMDGPU::TTMP_32RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 1; } else if (AMDGPU::VGPR_32RegClass.contains(reg)) { isSGPR = false; width = 1; } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + if (AMDGPU::TTMP_64RegClass.contains(reg)) { + llvm_unreachable("Trap Handler registers should not be used"); + } isSGPR = true; width = 2; } else if (AMDGPU::VReg_64RegClass.contains(reg)) { @@ -438,7 +434,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) ExtraSGPRs = 2; - if (STM.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { if (FlatUsed) ExtraSGPRs = 4; } else { @@ -451,23 +447,54 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MaxSGPR += ExtraSGPRs; + // Record first reserved register and reserved register count fields, and + // update max register counts if "amdgpu-debugger-reserve-regs" attribute was + // specified. + if (STM.debuggerReserveRegs()) { + ProgInfo.ReservedVGPRFirst = MaxVGPR + 1; + ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount(); + MaxVGPR += MFI->getDebuggerReservedVGPRCount(); + } + + // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and + // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" + // attribute was specified. + if (STM.debuggerEmitPrologue()) { + ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = + RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); + ProgInfo.DebuggerPrivateSegmentBufferSGPR = + RI->getHWRegIndex(MFI->getScratchRSrcReg()); + } + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
ProgInfo.NumVGPR = MaxVGPR + 1; ProgInfo.NumSGPR = MaxSGPR + 1; if (STM.hasSGPRInitBug()) { - if (ProgInfo.NumSGPR > AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { + if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) { LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("too many SGPRs used with the SGPR init bug"); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), + "SGPRs with SGPR init bug", + ProgInfo.NumSGPR, DS_Error); + Ctx.diagnose(Diag); } - ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("too many user SGPRs used"); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs", + MFI->NumUserSGPRs, DS_Error); + Ctx.diagnose(Diag); + } + + if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory", + MFI->LDSSize, DS_Error); + Ctx.diagnose(Diag); } ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; @@ -476,21 +503,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // register. ProgInfo.FloatMode = getFPMode(MF); - // XXX: Not quite sure what this does, but sc seems to unset this. ProgInfo.IEEEMode = 0; - // Do not clamp NAN to 0. - ProgInfo.DX10Clamp = 0; + // Make clamp modifier on NaN input returns 0. + ProgInfo.DX10Clamp = 1; const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo->estimateStackSize(MF); + ProgInfo.ScratchSize = FrameInfo->getStackSize(); ProgInfo.FlatUsed = FlatUsed; ProgInfo.VCCUsed = VCCUsed; ProgInfo.CodeLen = CodeSize; unsigned LDSAlignShift; - if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { + if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { @@ -503,7 +529,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize; ProgInfo.LDSBlocks = - RoundUpToAlignment(ProgInfo.LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword blocks. unsigned ScratchAlignShift = 10; @@ -511,8 +537,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // is used by the entire wave. ProgInfo.ScratchSize is the amount of // scratch memory used per thread. 
ProgInfo.ScratchBlocks = - RoundUpToAlignment(ProgInfo.ScratchSize * STM.getWavefrontSize(), - 1 << ScratchAlignShift) >> ScratchAlignShift; + alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(), + 1ULL << ScratchAlignShift) >> + ScratchAlignShift; ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | @@ -544,23 +571,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B84C_EXCP_EN(0); } -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { +static unsigned getRsrcReg(CallingConv::ID CallConv) { + switch (CallConv) { default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; } } void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo) { - const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); @@ -577,13 +604,13 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { + if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); } } - if (MFI->getShaderType() == ShaderType::PIXEL) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); @@ -591,12 +618,31 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4); } + + OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4); + OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4); + OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4); + OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4); +} + +// This is supposed to be log2(Size) +static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { + switch (Size) { + case 4: + return AMD_ELEMENT_4_BYTES; + case 8: + return AMD_ELEMENT_8_BYTES; + case 16: + return AMD_ELEMENT_16_BYTES; + default: + llvm_unreachable("invalid private_element_size"); + } } void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, const SIProgramInfo &KernelInfo) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget 
&STM = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); amd_kernel_code_t header; AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits()); @@ -606,6 +652,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, (KernelInfo.ComputePGMRSrc2 << 32); header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + AMD_HSA_BITS_SET(header.code_properties, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, + getElementByteSizeValue(STM.getMaxPrivateElementSize())); + if (MFI->hasPrivateSegmentBuffer()) { header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; @@ -646,6 +697,9 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + if (STM.debuggerSupported()) + header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + if (STM.isXNACKEnabled()) header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -654,9 +708,20 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; + header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + + if (STM.debuggerEmitPrologue()) { + header.debug_wavefront_private_segment_offset_sgpr = + KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + header.debug_private_segment_buffer_sgpr = + KernelInfo.DebuggerPrivateSegmentBufferSGPR; + } AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); TS->EmitAMDKernelCodeT(header); } @@ -680,3 +745,227 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo()); return false; } + +// Emit a key and an integer value for runtime metadata. +static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer, + RuntimeMD::Key K, uint64_t V, + unsigned Size) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(V, Size); +} + +// Emit a key and a string value for runtime metadata. +static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer, + RuntimeMD::Key K, StringRef S) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(S.size(), 4); + Streamer->EmitBytes(S); +} + +// Emit a key and three integer values for runtime metadata. 
+// The three integer values are obtained from MDNode \p Node; +static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer, + RuntimeMD::Key K, MDNode *Node, + unsigned Size) { + Streamer->EmitIntValue(K, 1); + Streamer->EmitIntValue(mdconst::extract<ConstantInt>( + Node->getOperand(0))->getZExtValue(), Size); + Streamer->EmitIntValue(mdconst::extract<ConstantInt>( + Node->getOperand(1))->getZExtValue(), Size); + Streamer->EmitIntValue(mdconst::extract<ConstantInt>( + Node->getOperand(2))->getZExtValue(), Size); +} + +void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) { + OutStreamer->SwitchSection(getObjFileLowering().getContext() + .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); + + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion, + RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2); + if (auto MD = M.getNamedMetadata("opencl.ocl.version")) { + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage, + RuntimeMD::OpenCL_C, 1); + auto Node = MD->getOperand(0); + unsigned short Major = mdconst::extract<ConstantInt>(Node->getOperand(0)) + ->getZExtValue(); + unsigned short Minor = mdconst::extract<ConstantInt>(Node->getOperand(1)) + ->getZExtValue(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion, + Major * 100 + Minor * 10, 2); + } +} + +static std::string getOCLTypeName(Type *Ty, bool isSigned) { + if (VectorType* VecTy = dyn_cast<VectorType>(Ty)) { + Type* EleTy = VecTy->getElementType(); + unsigned Size = VecTy->getVectorNumElements(); + return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str(); + } + switch (Ty->getTypeID()) { + case Type::HalfTyID: return "half"; + case Type::FloatTyID: return "float"; + case Type::DoubleTyID: return "double"; + case Type::IntegerTyID: { + if (!isSigned) + return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str(); + auto IntTy = cast<IntegerType>(Ty); + auto BW = IntTy->getIntegerBitWidth(); + switch (BW) { + case 8: + return "char"; + case 16: + return "short"; + case 32: + return "int"; + case 64: + return "long"; + default: + return (Twine('i') + Twine(BW)).str(); + } + } + default: + llvm_unreachable("invalid type"); + } +} + +static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType( + Type *Ty, StringRef TypeName) { + if (auto VT = dyn_cast<VectorType>(Ty)) + return getRuntimeMDValueType(VT->getElementType(), TypeName); + else if (auto PT = dyn_cast<PointerType>(Ty)) + return getRuntimeMDValueType(PT->getElementType(), TypeName); + else if (Ty->isHalfTy()) + return RuntimeMD::KernelArg::F16; + else if (Ty->isFloatTy()) + return RuntimeMD::KernelArg::F32; + else if (Ty->isDoubleTy()) + return RuntimeMD::KernelArg::F64; + else if (IntegerType* intTy = dyn_cast<IntegerType>(Ty)) { + bool Signed = !TypeName.startswith("u"); + switch (intTy->getIntegerBitWidth()) { + case 8: + return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8; + case 16: + return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16; + case 32: + return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32; + case 64: + return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64; + default: + // Runtime does not recognize other integer types. Report as + // struct type. 
+ return RuntimeMD::KernelArg::Struct; + } + } else + return RuntimeMD::KernelArg::Struct; +} + +void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) { + if (!F.getMetadata("kernel_arg_type")) + return; + + MCContext &Context = getObjFileLowering().getContext(); + OutStreamer->SwitchSection( + Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0)); + OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName()); + + for (auto &Arg:F.args()) { + // Emit KeyArgBegin. + unsigned I = Arg.getArgNo(); + OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1); + + // Emit KeyArgSize and KeyArgAlign. + auto T = Arg.getType(); + auto DL = F.getParent()->getDataLayout(); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize, + DL.getTypeAllocSize(T), 4); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign, + DL.getABITypeAlignment(T), 4); + + // Emit KeyArgTypeName. + auto TypeName = dyn_cast<MDString>(F.getMetadata( + "kernel_arg_type")->getOperand(I))->getString(); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName); + + // Emit KeyArgName. + if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) { + auto ArgName = cast<MDString>(ArgNameMD->getOperand( + I))->getString(); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName); + } + + // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe. + auto TypeQual = cast<MDString>(F.getMetadata( + "kernel_arg_type_qual")->getOperand(I))->getString(); + SmallVector<StringRef, 1> SplitQ; + TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/); + for (auto &I:SplitQ) { + auto Key = StringSwitch<RuntimeMD::Key>(I) + .Case("volatile", RuntimeMD::KeyArgIsVolatile) + .Case("restrict", RuntimeMD::KeyArgIsRestrict) + .Case("const", RuntimeMD::KeyArgIsConst) + .Case("pipe", RuntimeMD::KeyArgIsPipe) + .Default(RuntimeMD::KeyNull); + OutStreamer->EmitIntValue(Key, 1); + } + + // Emit KeyArgTypeKind. + auto BaseTypeName = cast<MDString>( + F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString(); + auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName) + .Case("sampler_t", RuntimeMD::KernelArg::Sampler) + .Case("queue_t", RuntimeMD::KernelArg::Queue) + .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t", + "image2d_t" , "image2d_array_t", RuntimeMD::KernelArg::Image) + .Cases("image2d_depth_t", "image2d_array_depth_t", + "image2d_msaa_t", "image2d_array_msaa_t", + "image2d_msaa_depth_t", RuntimeMD::KernelArg::Image) + .Cases("image2d_array_msaa_depth_t", "image3d_t", + RuntimeMD::KernelArg::Image) + .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer : + RuntimeMD::KernelArg::Value); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1); + + // Emit KeyArgValueType. + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType, + getRuntimeMDValueType(T, BaseTypeName), 2); + + // Emit KeyArgAccQual. + auto AccQual = cast<MDString>(F.getMetadata( + "kernel_arg_access_qual")->getOperand(I))->getString(); + auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual) + .Case("read_only", RuntimeMD::KernelArg::ReadOnly) + .Case("write_only", RuntimeMD::KernelArg::WriteOnly) + .Case("read_write", RuntimeMD::KernelArg::ReadWrite) + .Default(RuntimeMD::KernelArg::None); + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual, + AQ, 1); + + // Emit KeyArgAddrQual. 
+ if (isa<PointerType>(T)) + emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual, + T->getPointerAddressSpace(), 1); + + // Emit KeyArgEnd + OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1); + } + + // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint. + if (auto RWGS = F.getMetadata("reqd_work_group_size")) + emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize, + RWGS, 4); + if (auto WGSH = F.getMetadata("work_group_size_hint")) + emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint, + WGSH, 4); + if (auto VTH = F.getMetadata("vec_type_hint")) { + auto TypeName = getOCLTypeName(cast<ValueAsMetadata>( + VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>( + VTH->getOperand(1))->getZExtValue()); + emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint, + TypeName); + } + + // Emit KeyKernelEnd + OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1); +} diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 99d4091670fe..7b04c539520d 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -12,15 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H -#define LLVM_LIB_TARGET_R600_AMDGPUASMPRINTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H #include "llvm/CodeGen/AsmPrinter.h" #include <vector> namespace llvm { -class AMDGPUAsmPrinter : public AsmPrinter { +class AMDGPUAsmPrinter final : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : @@ -40,6 +40,10 @@ private: NumVGPR(0), NumSGPR(0), FlatUsed(false), + ReservedVGPRFirst(0), + ReservedVGPRCount(0), + DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), + DebuggerPrivateSegmentBufferSGPR((uint16_t)-1), VCCUsed(false), CodeLen(0) {} @@ -67,6 +71,20 @@ private: uint32_t LDSSize; bool FlatUsed; + // If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first + // fixed VGPR number reserved. + uint16_t ReservedVGPRFirst; + // The number of consecutive VGPRs reserved. + uint16_t ReservedVGPRCount; + + // Fixed SGPR number used to hold wave scratch offset for entire kernel + // execution, or uint16_t(-1) if the register is not used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire + // kernel execution, or uint16_t(-1) if the register is not used or not + // known. + uint16_t DebuggerPrivateSegmentBufferSGPR; + // Bonus information for debugging. bool VCCUsed; uint64_t CodeLen; @@ -109,6 +127,10 @@ public: unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; + void emitStartOfRuntimeMetadata(const Module &M); + + void emitRuntimeMetadata(const Function &F); + protected: std::vector<std::string> DisasmLines, HexLines; size_t DisasmLineMaxLen; diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp new file mode 100644 index 000000000000..1a1da8a254a7 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -0,0 +1,42 @@ +//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPUCallLowering.h" +#include "AMDGPUISelLowering.h" + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI) + : CallLowering(&TLI) { +} + +bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + return true; +} + +bool AMDGPUCallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const { + // TODO: Implement once there are generic loads/stores. + return true; +} diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h new file mode 100644 index 000000000000..61174bacdac3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h @@ -0,0 +1,36 @@ +//===- lib/Target/AMDGPU/AMDGPUCallLowering.h - Call lowering -*- C++ -*---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H + +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class AMDGPUTargetLowering; + +class AMDGPUCallLowering: public CallLowering { + public: + AMDGPUCallLowering(const AMDGPUTargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + bool + lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td index b0db26124a0c..47dfa4992068 100644 --- a/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -110,21 +110,19 @@ def CC_R600 : CallingConv<[ // Calling convention for compute kernels def CC_AMDGPU_Kernel : CallingConv<[ - CCCustom<"allocateStack"> + CCCustom<"allocateKernArg"> ]>; def CC_AMDGPU : CallingConv<[ CCIf<"static_cast<const AMDGPUSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >=" "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo<CC_AMDGPU_Kernel>>, CCIf<"static_cast<const AMDGPUSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo<CC_AMDGPU_Kernel>>, 
CCIf<"static_cast<const AMDGPUSubtarget&>" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp new file mode 100644 index 000000000000..3b415774df49 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -0,0 +1,82 @@ +//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass does misc. AMDGPU optimizations on IR before instruction +/// selection. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" + +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-codegenprepare" + +using namespace llvm; + +namespace { + +class AMDGPUCodeGenPrepare : public FunctionPass, + public InstVisitor<AMDGPUCodeGenPrepare> { + DivergenceAnalysis *DA; + const TargetMachine *TM; + +public: + static char ID; + AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : + FunctionPass(ID), + TM(TM) { } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "AMDGPU IR optimizations"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DivergenceAnalysis>(); + AU.setPreservesAll(); + } +}; + +} // End anonymous namespace + +bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { + return false; +} + +bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { + if (!TM || skipFunction(F)) + return false; + + DA = &getAnalysis<DivergenceAnalysis>(); + visit(F); + + return true; +} + +INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, + "AMDGPU IR optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, + "AMDGPU IR optimizations", false, false) + +char AMDGPUCodeGenPrepare::ID = 0; + +FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) { + return new AMDGPUCodeGenPrepare(TM); +} diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp deleted file mode 100644 index 2f6b3022dd6e..000000000000 --- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "AMDGPUDiagnosticInfoUnsupported.h" - -using namespace llvm; - -DiagnosticInfoUnsupported::DiagnosticInfoUnsupported( - const Function &Fn, - const Twine &Desc, - DiagnosticSeverity Severity) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - -int DiagnosticInfoUnsupported::KindID = 0; - -void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); -} diff --git a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h deleted file mode 100644 index 0fd37e1ede6b..000000000000 --- a/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H -#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H - -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" - -namespace llvm { - -/// Diagnostic information for unimplemented or unsupported feature reporting. -class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error); - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override; - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -} - -#endif diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 4d84d281d998..bbc28b885721 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -7,12 +7,13 @@ // //==-----------------------------------------------------------------------===// // -// Interface to describe a layout of a stack frame on a AMDIL target machine +// Interface to describe a layout of a stack frame on a AMDGPU target machine. 
// //===----------------------------------------------------------------------===// #include "AMDGPUFrameLowering.h" #include "AMDGPURegisterInfo.h" -#include "R600MachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" + #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Instructions.h" @@ -57,7 +58,7 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { // T2.Y = stack[1].y // T3.X = stack[1].z // T3.Y = stack[1].w - // + // // StackWidth = 4: // T0.X = stack[0].x // T0.Y = stack[0].y @@ -75,7 +76,8 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + const AMDGPURegisterInfo *RI + = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo(); // Fill in FrameReg output argument. FrameReg = RI->getFrameRegister(MF); @@ -87,32 +89,16 @@ int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, int UpperBound = FI == -1 ? MFI->getNumObjects() : FI; for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) { - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(i)); + OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(i)); OffsetBytes += MFI->getObjectSize(i); // Each register holds 4 bytes, so we must always align the offset to at // least 4 bytes, so that 2 frame objects won't share the same register. - OffsetBytes = RoundUpToAlignment(OffsetBytes, 4); + OffsetBytes = alignTo(OffsetBytes, 4); } if (FI != -1) - OffsetBytes = RoundUpToAlignment(OffsetBytes, MFI->getObjectAlignment(FI)); + OffsetBytes = alignTo(OffsetBytes, MFI->getObjectAlignment(FI)); return OffsetBytes / (getStackWidth(MF) * 4); } -const TargetFrameLowering::SpillSlot * -AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const { - NumEntries = 0; - return nullptr; -} -void AMDGPUFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} -void -AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { -} - -bool -AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const { - return false; -} diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 257a3da40589..513848a1d887 100644 --- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -32,13 +32,13 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. 
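// ---------------------------------------------------------------------------
// [Illustrative sketch -- not part of the upstream diff.]
// The getFrameIndexReference() hunk above returns a "register index" rather
// than a byte offset: each preceding object is aligned, padded up to 4 bytes
// so two objects never share one 32-bit register, and the final byte offset
// is divided by getStackWidth(MF) * 4. The same arithmetic as standalone
// code, with hypothetical object sizes and alignments:
#include <cstdint>
#include <vector>

namespace frame_sketch {
struct Object { uint64_t Size; uint64_t Align; };

inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Register index of object FI, given the objects laid out before it.
inline uint64_t frameIndexReference(const std::vector<Object> &Objects,
                                    unsigned FI, unsigned StackWidth) {
  uint64_t OffsetBytes = 0;
  for (unsigned I = 0; I != FI; ++I) {
    OffsetBytes = alignTo(OffsetBytes, Objects[I].Align);
    OffsetBytes += Objects[I].Size;
    OffsetBytes = alignTo(OffsetBytes, 4); // at least one register per object
  }
  OffsetBytes = alignTo(OffsetBytes, Objects[FI].Align);
  return OffsetBytes / (StackWidth * 4);
}
// With StackWidth = 1 and two 4-byte objects, object 0 maps to index 0 and
// object 1 to index 1 (byte offset 4 divided by 4).
} // namespace frame_sketch
// ---------------------------------------------------------------------------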
unsigned getStackWidth(const MachineFunction &MF) const; + int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - const SpillSlot * - getCalleeSavedSpillSlots(unsigned &NumEntries) const override; - void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; + + bool hasFP(const MachineFunction &MF) const override { + return false; + } }; } // namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b33040b4d06a..23c9352ce273 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1,4 +1,4 @@ -//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +//===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===// // // The LLVM Compiler Infrastructure // @@ -12,30 +12,44 @@ // //===----------------------------------------------------------------------===// -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" +#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD -#include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" -#include "R600InstrInfo.h" -#include "SIDefines.h" #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" +#include "llvm/IR/DiagnosticInfo.h" using namespace llvm; +namespace llvm { +class R600InstrInfo; +} + //===----------------------------------------------------------------------===// // Instruction Selector Implementation //===----------------------------------------------------------------------===// namespace { + +static bool isCBranchSCC(const SDNode *N) { + assert(N->getOpcode() == ISD::BRCOND); + if (!N->hasOneUse()) + return false; + + SDValue Cond = N->getOperand(1); + if (Cond.getOpcode() == ISD::CopyToReg) + Cond = Cond.getOperand(2); + return Cond.getOpcode() == ISD::SETCC && + Cond.getOperand(0).getValueType() == MVT::i32 && Cond.hasOneUse(); +} + /// AMDGPU specific code to select AMDGPU machine instructions for /// SelectionDAG operations. 
class AMDGPUDAGToDAGISel : public SelectionDAGISel { @@ -47,7 +61,7 @@ public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); bool runOnMachineFunction(MachineFunction &MF) override; - SDNode *Select(SDNode *N) override; + void Select(SDNode *N) override; const char *getPassName() const override; void PreprocessISelDAG() override; void PostprocessISelDAG() override; @@ -59,28 +73,8 @@ private: bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &); - // Complex pattern selectors - bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); - bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); - bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); - - static bool checkType(const Value *ptr, unsigned int addrspace); - static bool checkPrivateAddress(const MachineMemOperand *Op); - - static bool isGlobalStore(const StoreSDNode *N); - static bool isFlatStore(const StoreSDNode *N); - static bool isPrivateStore(const StoreSDNode *N); - static bool isLocalStore(const StoreSDNode *N); - static bool isRegionStore(const StoreSDNode *N); - - bool isCPLoad(const LoadSDNode *N) const; - bool isConstantLoad(const LoadSDNode *N, int cbID) const; - bool isGlobalLoad(const LoadSDNode *N) const; - bool isFlatLoad(const LoadSDNode *N) const; - bool isParamLoad(const LoadSDNode *N) const; - bool isPrivateLoad(const LoadSDNode *N) const; - bool isLocalLoad(const LoadSDNode *N) const; - bool isRegionLoad(const LoadSDNode *N) const; + bool isConstantLoad(const MemSDNode *N, int cbID) const; + bool isUniformBr(const SDNode *N) const; SDNode *glueCopyToM0(SDNode *N) const; @@ -111,7 +105,20 @@ private: SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, - SDValue &Offset, SDValue &GLC) const; + SDValue &Offset, SDValue &SLC) const; + bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, + SDValue &Offset) const; + bool SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset) const; + bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, + SDValue &ImmOffset, SDValue &VOffset) const; + + bool SelectFlat(SDValue Addr, SDValue &VAddr, + SDValue &SLC, SDValue &TFE) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, @@ -122,7 +129,7 @@ private: bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; - SDNode *SelectAddrSpaceCast(SDNode *N); + bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -136,13 +143,15 @@ private: SDValue &Clamp, SDValue &Omod) const; - SDNode *SelectADD_SUB_I64(SDNode *N); - SDNode *SelectDIV_SCALE(SDNode *N); + void SelectADD_SUB_I64(SDNode *N); + void SelectDIV_SCALE(SDNode *N); - SDNode *getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, + SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width); - SDNode 
*SelectS_BFEFromShifts(SDNode *N); - SDNode *SelectS_BFE(SDNode *N); + void SelectS_BFEFromShifts(SDNode *N); + void SelectS_BFE(SDNode *N); + void SelectBRCOND(SDNode *N); + void SelectATOMIC_CMP_SWAP(SDNode *N); // Include the pieces autogenerated from the target description. #include "AMDGPUGenDAGISel.inc" @@ -159,7 +168,7 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM) : SelectionDAGISel(TM) {} bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { - Subtarget = &static_cast<const AMDGPUSubtarget &>(MF.getSubtarget()); + Subtarget = &MF.getSubtarget<AMDGPUSubtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -207,64 +216,9 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, } } -bool AMDGPUDAGToDAGISel::SelectADDRParam( - SDValue Addr, SDValue& R1, SDValue& R2) { - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32); - } - return true; -} - -bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - return SelectADDRParam(Addr, R1, R2); -} - - -bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { - if (Addr.getOpcode() == ISD::TargetExternalSymbol || - Addr.getOpcode() == ISD::TargetGlobalAddress) { - return false; - } - - if (Addr.getOpcode() == ISD::FrameIndex) { - if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { - R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - } else if (Addr.getOpcode() == ISD::ADD) { - R1 = Addr.getOperand(0); - R2 = Addr.getOperand(1); - } else { - R1 = Addr; - R2 = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i64); - } - return true; -} - SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - !checkType(cast<MemSDNode>(N)->getMemOperand()->getValue(), - AMDGPUAS::LOCAL_ADDRESS)) + cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) return N; const SITargetLowering& Lowering = @@ -304,14 +258,15 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { llvm_unreachable("invalid vector size"); } -SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { +void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { N->setNodeId(-1); - return nullptr; // Already selected. + return; // Already selected. 
} - if (isa<AtomicSDNode>(N)) + if (isa<AtomicSDNode>(N) || + (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC)) N = glueCopyToM0(N); switch (Opc) { @@ -325,7 +280,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - return SelectADD_SUB_I64(N); + SelectADD_SUB_I64(N); + return; } case ISD::SCALAR_TO_VECTOR: case AMDGPUISD::BUILD_VERTICAL_VECTOR: @@ -359,8 +315,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); if (NumVectorElts == 1) { - return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, - N->getOperand(0), RegClass); + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0), + RegClass); + return; } assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not " @@ -400,8 +357,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { if (!IsRegSeq) break; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), - RegSeqArgs); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); + return; } case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; @@ -422,8 +379,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { } const SDValue Ops[] = { RC, N->getOperand(0), SubReg0, N->getOperand(1), SubReg1 }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops)); + return; } case ISD::Constant: @@ -452,8 +410,9 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) }; - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, - N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, + N->getValueType(0), Ops)); + return; } case ISD::LOAD: case ISD::STORE: { @@ -487,11 +446,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { uint32_t OffsetVal = Offset->getZExtValue(); uint32_t WidthVal = Width->getZExtValue(); - return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), - N->getOperand(0), OffsetVal, WidthVal); + ReplaceNode(N, getS_BFE(Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, + SDLoc(N), N->getOperand(0), OffsetVal, WidthVal)); + return; } case AMDGPUISD::DIV_SCALE: { - return SelectDIV_SCALE(N); + SelectDIV_SCALE(N); + return; } case ISD::CopyToReg: { const SITargetLowering& Lowering = @@ -499,139 +460,48 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { Lowering.legalizeTargetIndependentNode(N, *CurDAG); break; } - case ISD::ADDRSPACECAST: - return SelectAddrSpaceCast(N); case ISD::AND: case ISD::SRL: case ISD::SRA: + case ISD::SIGN_EXTEND_INREG: if (N->getValueType(0) != MVT::i32 || Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) break; - return SelectS_BFE(N); + SelectS_BFE(N); + return; + case ISD::BRCOND: + SelectBRCOND(N); + return; + + case AMDGPUISD::ATOMIC_CMP_SWAP: + SelectATOMIC_CMP_SWAP(N); + return; } - return SelectCode(N); + SelectCode(N); } -bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { - assert(AS != 0 && "Use checkPrivateAddress instead."); - if (!Ptr) +bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { + if (!N->readMem()) return false; - - return Ptr->getType()->getPointerAddressSpace() == AS; -} - -bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) { - if (Op->getPseudoValue()) - return true; - - if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType())) - return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; - - return false; -} - -bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { - const Value *MemVal = N->getMemOperand()->getValue(); - return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS)); -} - -bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const { - const Value *MemVal = N->getMemOperand()->getValue(); if (CbId == -1) - return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS); + return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; - return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId); + return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId; } -bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const { - if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS || - N->getMemoryVT().bitsLT(MVT::i32)) - return true; - - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::FLAT_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode 
*N) const { - return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS); -} - -bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { - MachineMemOperand *MMO = N->getMemOperand(); - if (checkPrivateAddress(N->getMemOperand())) { - if (MMO) { - const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV->isConstantPool()) { - return true; - } - } - } - return false; -} - -bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const { - if (checkPrivateAddress(N->getMemOperand())) { - // Check to make sure we are not a constant pool load or a constant load - // that is marked as a private load - if (isCPLoad(N) || isConstantLoad(N, -1)) { - return false; - } - } - - const Value *MemVal = N->getMemOperand()->getValue(); - if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) && - !checkType(MemVal, AMDGPUAS::FLAT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) && - !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) && - !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)) { - return true; - } - return false; +bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const Instruction *Term = BB->getTerminator(); + return Term->getMetadata("amdgpu.uniform") || + Term->getMetadata("structurizecfg.uniform"); } const char *AMDGPUDAGToDAGISel::getPassName() const { return "AMDGPU DAG->DAG Pattern Instruction Selection"; } -#ifdef DEBUGTMP -#undef INT64_C -#endif -#undef DEBUGTMP - //===----------------------------------------------------------------------===// // Complex Patterns //===----------------------------------------------------------------------===// @@ -705,7 +575,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base, return true; } -SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDLoc DL(N); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); @@ -728,7 +598,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); SDValue AddLoArgs[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) }; - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; @@ -745,12 +614,12 @@ SDNode *AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDValue(AddHi,0), Sub1, }; - return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); + CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. 
-SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { SDLoc SL(N); EVT VT = N->getValueType(0); @@ -766,7 +635,7 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]); SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]); - return CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); + CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops); } bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -786,6 +655,7 @@ bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset, bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, SDValue &Offset) const { + SDLoc DL(Addr); if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); @@ -793,7 +663,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, if (isDSOffsetLegal(N0, C1->getSExtValue(), 16)) { // (add n0, c0) Base = N0; - Offset = N1; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); return true; } } else if (Addr.getOpcode() == ISD::SUB) { @@ -801,7 +671,6 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { int64_t ByteOffset = C->getSExtValue(); if (isUInt<16>(ByteOffset)) { - SDLoc DL(Addr); SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); // XXX - This is kind of hacky. Create a dummy sub node so we can check @@ -816,7 +685,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Zero, Addr.getOperand(1)); Base = SDValue(MachineSub, 0); - Offset = Addr.getOperand(0); + Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16); return true; } } @@ -834,7 +703,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); Base = SDValue(MovZero, 0); - Offset = Addr; + Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; } } @@ -932,8 +801,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDLoc DL(Addr); - GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); - SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + if (!GLC.getNode()) + GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + if (!SLC.getNode()) + SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -961,9 +832,11 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, } if (isLegalMUBUFImmOffset(C1)) { - Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } else if (isUInt<32>(C1->getZExtValue())) { + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; + } + + if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, @@ -1045,14 +918,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); + // Offsets in vaddr must be positive. 
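// ---------------------------------------------------------------------------
// [Illustrative sketch -- not part of the upstream diff.]
// SelectADD_SUB_I64 above lowers a 64-bit add/sub into two 32-bit scalar ops:
// the low halves go through S_ADD_U32 / S_SUB_U32 (producing a carry/borrow),
// the high halves through S_ADDC_U32 / S_SUBB_U32 (consuming it), and the two
// 32-bit results are reassembled with REG_SEQUENCE. The plain-integer
// equivalent of the add case:
#include <cstdint>

namespace add64_sketch {
inline uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
  uint32_t Lo = ALo + BLo;                 // s_add_u32
  uint32_t Carry = Lo < ALo ? 1u : 0u;     // carry out of the low add
  uint32_t Hi = AHi + BHi + Carry;         // s_addc_u32
  return (uint64_t(Hi) << 32) | Lo;        // reg_sequence sub0/sub1
}
} // namespace add64_sketch
// ---------------------------------------------------------------------------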
- if (CurDAG->SignBitIsZero(N0)) { - ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - if (isLegalMUBUFImmOffset(C1)) { - VAddr = N0; - ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return true; - } + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); + if (isLegalMUBUFImmOffset(C1)) { + VAddr = N0; + ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); + return true; } } @@ -1091,13 +963,118 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, + SDValue &Soffset, SDValue &Offset + ) const { + SDValue GLC, SLC, TFE; + + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); +} +bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, - SDValue &GLC) const { - SDValue SLC, TFE; + SDValue &SLC) const { + SDValue GLC, TFE; return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Constant); + uint32_t Imm = cast<ConstantSDNode>(Constant)->getZExtValue(); + uint32_t Overflow = 0; + + if (Imm >= 4096) { + if (Imm <= 4095 + 64) { + // Use an SOffset inline constant for 1..64 + Overflow = Imm - 4095; + Imm = 4095; + } else { + // Try to keep the same value in SOffset for adjacent loads, so that + // the corresponding register contents can be re-used. + // + // Load values with all low-bits set into SOffset, so that a larger + // range of values can be covered using s_movk_i32 + uint32_t High = (Imm + 1) & ~4095; + uint32_t Low = (Imm + 1) & 4095; + Imm = Low; + Overflow = High - 1; + } + } + + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (Overflow > 0 && + Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + + ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); + + if (Overflow <= 64) + SOffset = CurDAG->getTargetConstant(Overflow, DL, MVT::i32); + else + SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, + CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), + 0); + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset) const { + SDLoc DL(Offset); + + if (!isa<ConstantSDNode>(Offset)) + return false; + + return SelectMUBUFConstant(Offset, SOffset, ImmOffset); +} + +bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, + SDValue &SOffset, + SDValue &ImmOffset, + SDValue &VOffset) const { + SDLoc DL(Offset); + + // Don't generate an unnecessary voffset for constant offsets. + if (isa<ConstantSDNode>(Offset)) { + SDValue Tmp1, Tmp2; + + // When necessary, use a voffset in <= CI anyway to work around a hardware + // bug. 
+ if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS || + SelectMUBUFConstant(Offset, Tmp1, Tmp2)) + return false; + } + + if (CurDAG->isBaseWithConstantOffset(Offset)) { + SDValue N0 = Offset.getOperand(0); + SDValue N1 = Offset.getOperand(1); + if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 && + SelectMUBUFConstant(N1, SOffset, ImmOffset)) { + VOffset = N0; + return true; + } + } + + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + VOffset = Offset; + + return true; +} + +bool AMDGPUDAGToDAGISel::SelectFlat(SDValue Addr, + SDValue &VAddr, + SDValue &SLC, + SDValue &TFE) const { + VAddr = Addr; + TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + return true; +} + /// /// \param EncodedOffset This is the immediate value that will be encoded /// directly into the instruction. On SI/CI the \p EncodedOffset @@ -1213,71 +1190,33 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, !isa<ConstantSDNode>(Offset); } -// FIXME: This is incorrect and only enough to be able to compile. -SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { - AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); - SDLoc DL(N); - - const MachineFunction &MF = CurDAG->getMachineFunction(); - DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), - "addrspacecast not implemented"); - CurDAG->getContext()->diagnose(NotImplemented); - - assert(Subtarget->hasFlatAddressSpace() && - "addrspacecast only supported with flat address space!"); - - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || - ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && - "Can only cast to / from flat address space!"); - - // The flat instructions read the address as the index of the VGPR holding the - // address, so casting should just be reinterpreting the base VGPR, so just - // insert trunc / bitcast / zext. 
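// ---------------------------------------------------------------------------
// [Illustrative sketch -- not part of the upstream diff.]
// SelectMUBUFConstant above splits a buffer offset that does not fit in the
// 12-bit immediate field (0..4095) into SOffset + ImmOffset. Offsets up to
// 4095 + 64 keep SOffset as an inline constant; larger offsets put all low
// bits into ImmOffset so that neighbouring loads can reuse one s_movk_i32
// SOffset value. The same arithmetic as plain code, with two worked examples:
#include <cstdint>

namespace mubuf_sketch {
struct Split { uint32_t ImmOffset; uint32_t SOffset; };

inline Split splitMUBUFOffset(uint32_t Imm) {
  uint32_t Overflow = 0;
  if (Imm >= 4096) {
    if (Imm <= 4095 + 64) {
      Overflow = Imm - 4095;           // small: SOffset inline constant 1..64
      Imm = 4095;
    } else {
      uint32_t High = (Imm + 1) & ~4095u;
      uint32_t Low = (Imm + 1) & 4095u;
      Imm = Low;
      Overflow = High - 1;             // all low bits set, reusable SOffset
    }
  }
  return {Imm, Overflow};              // ImmOffset + SOffset == original Imm
}
// splitMUBUFOffset(4100) -> {4095, 5};    4095 + 5    == 4100
// splitMUBUFOffset(5000) -> {905, 4095};  905  + 4095 == 5000
} // namespace mubuf_sketch
// ---------------------------------------------------------------------------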
- - SDValue Src = ASC->getOperand(0); - EVT DestVT = ASC->getValueType(0); - EVT SrcVT = Src.getValueType(); - - unsigned SrcSize = SrcVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - - if (SrcSize > DestSize) { - assert(SrcSize == 64 && DestSize == 32); - return CurDAG->getMachineNode( - TargetOpcode::EXTRACT_SUBREG, - DL, - DestVT, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); - } - - if (DestSize > SrcSize) { - assert(SrcSize == 32 && DestSize == 64); - - // FIXME: This is probably wrong, we should never be defining - // a register class with both VGPRs and SGPRs - SDValue RC = CurDAG->getTargetConstant(AMDGPU::VS_64RegClassID, DL, - MVT::i32); +bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, + SDValue &Base, + SDValue &Offset) const { + SDLoc DL(Index); - const SDValue Ops[] = { - RC, - Src, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, - CurDAG->getConstant(0, DL, MVT::i32)), 0), - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; + if (CurDAG->isBaseWithConstantOffset(Index)) { + SDValue N0 = Index.getOperand(0); + SDValue N1 = Index.getOperand(1); + ConstantSDNode *C1 = cast<ConstantSDNode>(N1); - return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, - DL, N->getValueType(0), Ops); + // (add n0, c0) + Base = N0; + Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32); + return true; } - assert(SrcSize == 64 && DestSize == 64); - return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); + if (isa<ConstantSDNode>(Index)) + return false; + + Base = Index; + Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); + return true; } -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, - uint32_t Offset, uint32_t Width) { +SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL, + SDValue Val, uint32_t Offset, + uint32_t Width) { // Transformation function, pack the offset and width of a BFE into // the format expected by the S_BFE_I32 / S_BFE_U32. In the second // source, bits [5:0] contain the offset and bits [22:16] the width. @@ -1287,7 +1226,7 @@ SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, SDLoc DL, SDValue Val, return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst); } -SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c) // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c) // Predicate: 0 < b <= c < 32 @@ -1304,14 +1243,15 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) { bool Signed = N->getOpcode() == ISD::SRA; unsigned Opcode = Signed ? 
AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - return getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), - CVal - BVal, 32 - CVal); + ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, + 32 - CVal)); + return; } } - return SelectCode(N); + SelectCode(N); } -SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { +void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { switch (N->getOpcode()) { case ISD::AND: if (N->getOperand(0).getOpcode() == ISD::SRL) { @@ -1328,8 +1268,9 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), Srl.getOperand(0), - ShiftVal, WidthVal); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), + Srl.getOperand(0), ShiftVal, WidthVal)); + return; } } } @@ -1349,20 +1290,139 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) { if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - return getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), And.getOperand(0), - ShiftVal, WidthVal); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), + And.getOperand(0), ShiftVal, WidthVal)); + return; } } - } else if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); + } else if (N->getOperand(0).getOpcode() == ISD::SHL) { + SelectS_BFEFromShifts(N); + return; + } break; case ISD::SRA: - if (N->getOperand(0).getOpcode() == ISD::SHL) - return SelectS_BFEFromShifts(N); + if (N->getOperand(0).getOpcode() == ISD::SHL) { + SelectS_BFEFromShifts(N); + return; + } break; + + case ISD::SIGN_EXTEND_INREG: { + // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8 + SDValue Src = N->getOperand(0); + if (Src.getOpcode() != ISD::SRL) + break; + + const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1)); + if (!Amt) + break; + + unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); + ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), + Amt->getZExtValue(), Width)); + return; + } } - return SelectCode(N); + SelectCode(N); +} + +void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { + SDValue Cond = N->getOperand(1); + + if (isCBranchSCC(N)) { + // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it. + SelectCode(N); + return; + } + + // The result of VOPC instructions is or'd against ~EXEC before it is + // written to vcc or another SGPR. This means that the value '1' is always + // written to the corresponding bit for results that are masked. In order + // to correctly check against vccz, we need to and VCC with the EXEC + // register in order to clear the value from the masked bits. + + SDLoc SL(N); + + SDNode *MaskedCond = + CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1, + CurDAG->getRegister(AMDGPU::EXEC, MVT::i1), + Cond); + SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC, + SDValue(MaskedCond, 0), + SDValue()); // Passing SDValue() adds a + // glue output. + CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other, + N->getOperand(2), // Basic Block + VCC.getValue(0), // Chain + VCC.getValue(1)); // Glue + return; +} + +// This is here because there isn't a way to use the generated sub0_sub1 as the +// subreg index to EXTRACT_SUBREG in tablegen. 
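// ---------------------------------------------------------------------------
// [Illustrative sketch -- not part of the upstream diff.]
// getS_BFE / SelectS_BFE above turn shift-and-mask, shl/sra, and sext_inreg
// patterns into S_BFE_U32 / S_BFE_I32, whose second operand packs the field
// position and size together: bits [5:0] hold the offset and bits [22:16] the
// width. For example, "(x >> 16) & 0xff" becomes an unsigned BFE with offset
// 16 and width 8. A software model of the packing and the unsigned extract:
#include <cstdint>

namespace bfe_sketch {
inline uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
  return Offset | (Width << 16);       // offset in [5:0], width in [22:16]
}

// Unsigned extract: (x >> Offset) & ((1 << Width) - 1), width 32 = full word.
inline uint32_t bfeU32(uint32_t X, uint32_t Offset, uint32_t Width) {
  uint32_t Mask = (Width < 32 ? (1u << Width) : 0u) - 1u;
  return (X >> Offset) & Mask;
}
// packBFEOperand(16, 8) == 0x00080010
} // namespace bfe_sketch
// ---------------------------------------------------------------------------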
+void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) { + MemSDNode *Mem = cast<MemSDNode>(N); + unsigned AS = Mem->getAddressSpace(); + if (AS == AMDGPUAS::FLAT_ADDRESS) { + SelectCode(N); + return; + } + + MVT VT = N->getSimpleValueType(0); + bool Is32 = (VT == MVT::i32); + SDLoc SL(N); + + MachineSDNode *CmpSwap = nullptr; + if (Subtarget->hasAddr64()) { + SDValue SRsrc, VAddr, SOffset, Offset, GLC, SLC; + + if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) { + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64 : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64; + SDValue CmpVal = Mem->getOperand(2); + + // XXX - Do we care about glue operands? + + SDValue Ops[] = { + CmpVal, VAddr, SRsrc, SOffset, Offset, SLC, Mem->getChain() + }; + + CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); + } + } + + if (!CmpSwap) { + SDValue SRsrc, SOffset, Offset, SLC; + if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) { + unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET : + AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET; + + SDValue CmpVal = Mem->getOperand(2); + SDValue Ops[] = { + CmpVal, SRsrc, SOffset, Offset, SLC, Mem->getChain() + }; + + CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops); + } + } + + if (!CmpSwap) { + SelectCode(N); + return; + } + + MachineSDNode::mmo_iterator MMOs = MF->allocateMemRefsArray(1); + *MMOs = Mem->getMemOperand(); + CmpSwap->setMemRefs(MMOs, MMOs + 1); + + unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1; + SDValue Extract + = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0)); + + ReplaceUses(SDValue(N, 0), Extract); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1)); + CurDAG->RemoveDeadNode(N); } bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src, @@ -1432,62 +1492,59 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, } void AMDGPUDAGToDAGISel::PreprocessISelDAG() { - bool Modified = false; - - // XXX - Other targets seem to be able to do this without a worklist. - SmallVector<LoadSDNode *, 8> LoadsToReplace; - SmallVector<StoreSDNode *, 8> StoresToReplace; - - for (SDNode &Node : CurDAG->allnodes()) { - if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { - EVT VT = LD->getValueType(0); - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) - continue; - - // To simplify the TableGen patters, we replace all i64 loads with v2i32 - // loads. Alternatively, we could promote i64 loads to v2i32 during DAG - // legalization, however, so places (ExpandUnalignedLoad) in the DAG - // legalizer assume that if i64 is legal, so doing this promotion early - // can cause problems. - LoadsToReplace.push_back(LD); - } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { - // Handle i64 stores here for the same reason mentioned above for loads. - SDValue Value = ST->getValue(); - if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) - continue; - StoresToReplace.push_back(ST); + MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo(); + + // Handle the perverse case where a frame index is being stored. We don't + // want to see multiple frame index operands on the same instruction since + // it complicates things and violates some assumptions about frame index + // lowering. 
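// ---------------------------------------------------------------------------
// [Illustrative sketch -- not part of the upstream diff.]
// SelectBRCOND above does not branch on the raw condition mask: per the hunk's
// comment, masked-off lanes read back as 1, so the condition is first ANDed
// with EXEC (S_AND_B64), copied to VCC, and the branch uses S_CBRANCH_VCCNZ.
// A bitmask model of that check, with hypothetical 64-lane masks:
#include <cstdint>

namespace brcond_sketch {
// True iff any *active* lane (EXEC bit set) has a true condition.
inline bool branchTaken(uint64_t VOPCResult, uint64_t Exec) {
  uint64_t VCC = VOPCResult & Exec;    // clear the bits of inactive lanes
  return VCC != 0;                     // s_cbranch_vccnz
}
} // namespace brcond_sketch
// ---------------------------------------------------------------------------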
+ for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd(); + I != E; ++I) { + SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32); + + // It's possible that we have a frame index defined in the function that + // isn't used in this block. + if (FI.use_empty()) + continue; + + // Skip over the AssertZext inserted during lowering. + SDValue EffectiveFI = FI; + auto It = FI->use_begin(); + if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) { + EffectiveFI = SDValue(*It, 0); + It = EffectiveFI->use_begin(); } - } - - for (LoadSDNode *LD : LoadsToReplace) { - SDLoc SL(LD); - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); - Modified = true; - } - for (StoreSDNode *ST : StoresToReplace) { - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), - MVT::v2i32, ST->getValue()); - const SDValue StoreOps[] = { - ST->getChain(), - NewValue, - ST->getBasePtr(), - ST->getOffset() - }; + for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) { + SDUse &Use = It.getUse(); + SDNode *User = Use.getUser(); + unsigned OpIdx = It.getOperandNo(); + ++It; + + if (MemSDNode *M = dyn_cast<MemSDNode>(User)) { + unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1; + if (OpIdx == PtrIdx) + continue; + + unsigned OpN = M->getNumOperands(); + SDValue NewOps[8]; + + assert(OpN < array_lengthof(NewOps)); + for (unsigned Op = 0; Op != OpN; ++Op) { + if (Op != OpIdx) { + NewOps[Op] = M->getOperand(Op); + continue; + } + + MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, + SDLoc(M), MVT::i32, FI); + NewOps[Op] = SDValue(Mov, 0); + } - CurDAG->UpdateNodeOperands(ST, StoreOps); - Modified = true; + CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN)); + } + } } - - // XXX - Is this necessary? 
- if (Modified) - CurDAG->RemoveDeadNodes(); } void AMDGPUDAGToDAGISel::PostprocessISelDAG() { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 1a59a460ee7d..352423ed3ad6 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,7 +15,6 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -28,16 +27,19 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" - +#include "llvm/IR/DiagnosticInfo.h" +#include "SIInstrInfo.h" using namespace llvm; -static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - unsigned Offset = State.AllocateStack(ValVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); +static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + MachineFunction &MF = State.getMachineFunction(); + AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); + uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); return true; } @@ -53,60 +55,104 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -// Type for a vector that will be loaded to. -EVT AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext &Ctx, EVT VT) { +EVT AMDGPUTargetLowering::getEquivalentBitType(LLVMContext &Ctx, EVT VT) { unsigned StoreSize = VT.getStoreSizeInBits(); if (StoreSize <= 32) - return EVT::getIntegerVT(Ctx, 32); + return EVT::getIntegerVT(Ctx, StoreSize); return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } -AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, +AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - setOperationAction(ISD::Constant, MVT::i32, Legal); - setOperationAction(ISD::Constant, MVT::i64, Legal); - setOperationAction(ISD::ConstantFP, MVT::f32, Legal); - setOperationAction(ISD::ConstantFP, MVT::f64, Legal); + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::LOAD, MVT::v2f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); - // This is totally unsupported, just custom lower to produce an error. - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); - // We need to custom lower some of the intrinsics - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::LOAD, MVT::v8f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); - // Library functions. These default to Expand, but we have instructions - // for them. 
- setOperationAction(ISD::FCEIL, MVT::f32, Legal); - setOperationAction(ISD::FEXP2, MVT::f32, Legal); - setOperationAction(ISD::FPOW, MVT::f32, Legal); - setOperationAction(ISD::FLOG2, MVT::f32, Legal); - setOperationAction(ISD::FABS, MVT::f32, Legal); - setOperationAction(ISD::FFLOOR, MVT::f32, Legal); - setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FTRUNC, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::LOAD, MVT::v16f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - setOperationAction(ISD::FROUND, MVT::f32, Custom); - setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); - setOperationAction(ISD::FREM, MVT::f32, Custom); - setOperationAction(ISD::FREM, MVT::f64, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - // v_mad_f32 does not support denormals according to some sources. - if (!Subtarget->hasFP32Denormals()) - setOperationAction(ISD::FMAD, MVT::f32, Legal); + setOperationAction(ISD::LOAD, MVT::f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32); - // Expand to fneg + fadd. - setOperationAction(ISD::FSUB, MVT::f64, Expand); + setOperationAction(ISD::LOAD, MVT::v2f64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32); + + // There are no 64-bit extloads. These should be done as a 32-bit extload and + // an extension to 64-bit. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); + } + + for (MVT VT : MVT::integer_valuetypes()) { + if (VT == MVT::i64) + continue; + + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); + } + + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); + } + + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); + 
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - // Lower floating point store/load to integer store/load to reduce the number - // of patterns in tablegen. setOperationAction(ISD::STORE, MVT::f32, Promote); AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); @@ -122,51 +168,99 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f32, Promote); AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::i64, Promote); + AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::f64, Promote); - AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64); + AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v2f64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64); + AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32); - // Custom lowering of vector stores is required for local address space - // stores. - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom); - // XXX: This can be change to Custom, once ExpandVectorStores can - // handle 64-bit stores. 
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i16, Expand); - setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i1, Expand); + setTruncStoreAction(MVT::i64, MVT::i8, Expand); + setTruncStoreAction(MVT::i64, MVT::i16, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand); - setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); + setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - setOperationAction(ISD::LOAD, MVT::f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setOperationAction(ISD::LOAD, MVT::v2f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32); + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - setOperationAction(ISD::LOAD, MVT::v4f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand); + setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand); - setOperationAction(ISD::LOAD, MVT::v8f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32); + setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand); + setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand); - setOperationAction(ISD::LOAD, MVT::v16f32, Promote); - AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32); - setOperationAction(ISD::LOAD, MVT::f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64); + setOperationAction(ISD::Constant, MVT::i32, Legal); + setOperationAction(ISD::Constant, MVT::i64, Legal); + setOperationAction(ISD::ConstantFP, MVT::f32, Legal); + setOperationAction(ISD::ConstantFP, MVT::f64, Legal); - setOperationAction(ISD::LOAD, MVT::v2f64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. 
+ setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + + setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); + setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Custom); + setOperationAction(ISD::FREM, MVT::f64, Custom); + + // v_mad_f32 does not support denormals according to some sources. + if (!Subtarget->hasFP32Denormals()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); + + // Expand to fneg + fadd. + setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom); @@ -179,31 +273,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - // There are no 64-bit extloads. These should be done as a 32-bit extload and - // an extension to 64-bit. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand); - } - - setOperationAction(ISD::BR_CC, MVT::i1, Expand); - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FCEIL, MVT::f64, Custom); setOperationAction(ISD::FTRUNC, MVT::f64, Custom); @@ -219,28 +288,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand); - 
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand); - - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::SREM, VT, Expand); + // These should use [SU]DIVREM, so set them to expand setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); // GPU does not have divrem function for signed or unsigned. setOperationAction(ISD::SDIVREM, VT, Custom); @@ -284,17 +338,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, if (Subtarget->hasFFBH()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom); - else - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - - if (!Subtarget->hasFFBL()) - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + if (Subtarget->hasFFBL()) + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal); setOperationAction(ISD::CTLZ, MVT::i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); + // We only really have 32-bit BFE instructions (and 16-bit on VI). + // + // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any + // effort to match them now. We want this to be false for i64 cases when the + // extraction isn't restricted to the upper or lower half. Ideally we would + // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that + // span the midpoint are probably relatively rare, so don't worry about them + // for now. + if (Subtarget->hasBFE()) + setHasExtractBitsInsn(true); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v4i32 }; @@ -334,9 +395,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } @@ -366,24 +425,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); - setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); } - setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom); - setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); - - setTargetDAGCombine(ISD::SHL); - setTargetDAGCombine(ISD::MUL); - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::STORE); + // This causes using an unrolled select operation rather than expansion with + // bit operations. This is in general better, but the alternative using BFI + // instructions may be better if the select sources are SGPRs. 
+ setOperationAction(ISD::SELECT, MVT::v2f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32); - setTargetDAGCombine(ISD::FADD); - setTargetDAGCombine(ISD::FSUB); + setOperationAction(ISD::SELECT, MVT::v4f32, Promote); + AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32); setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); @@ -394,7 +449,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, // SI at least has hardware support for floating point exceptions, but no way // of using or handling them is implemented. They are also optional in OpenCL // (Section 7.3) - setHasFloatingPointExceptions(false); + setHasFloatingPointExceptions(Subtarget->hasFPExceptions()); setSelectIsExpensive(false); PredictableSelectIsExpensive = false; @@ -415,6 +470,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; MaxStoresPerMemset = 4096; + + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::AND); + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); + setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::STORE); + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FSUB); } //===----------------------------------------------------------------------===// @@ -467,15 +534,17 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N, bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const { - if (LoadTy.getSizeInBits() != CastTy.getSizeInBits()) - return true; - unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits(); - unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits(); + assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits()); - return ((LScalarSize <= CastScalarSize) || - (CastScalarSize >= 32) || - (LScalarSize < 32)); + if (LoadTy.getScalarType() == MVT::i32) + return false; + + unsigned LScalarSize = LoadTy.getScalarSizeInBits(); + unsigned CastScalarSize = CastTy.getScalarSizeInBits(); + + return (LScalarSize < CastScalarSize) || + (CastScalarSize >= 32); } // SI+ has instructions for cttz / ctlz for 32-bit values. 
This is probably also @@ -578,14 +647,13 @@ void AMDGPUTargetLowering::AnalyzeReturn(CCState &State, State.AnalyzeReturn(Outs, RetCC_SI); } -SDValue AMDGPUTargetLowering::LowerReturn( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +SDValue +AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain); } //===---------------------------------------------------------------------===// @@ -606,32 +674,38 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) FuncName = G->getGlobal()->getName(); - DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName); + DiagnosticInfoUnsupported NoCalls( + Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc()); DAG.getContext()->diagnose(NoCalls); - return SDValue(); + + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + + return DAG.getEntryNode(); } SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca", + SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(NoDynamicAlloca); - return SDValue(); + auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)}; + return DAG.getMergeValues(Ops, SDLoc()); } SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: - Op.getNode()->dump(); + Op->dump(&DAG); llvm_unreachable("Custom lowering code for this" "instruction is not implemented yet!"); break; case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); case ISD::SDIVREM: return LowerSDIVREM(Op, DAG); @@ -666,24 +740,6 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do // nothing here and let the illegal result integer be handled normally. 
return; - case ISD::LOAD: { - SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); - if (!Node) - return; - - Results.push_back(SDValue(Node, 0)); - Results.push_back(SDValue(Node, 1)); - // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode - // function - DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); - return; - } - case ISD::STORE: { - SDValue Lowered = LowerSTORE(SDValue(N, 0), DAG); - if (Lowered.getNode()) - Results.push_back(Lowered); - return; - } default: return; } @@ -712,16 +768,16 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, EVT VT = EVT::getEVT(InitTy); PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); return DAG.getStore(Chain, DL, DAG.getConstant(*CI, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(InitTy)); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(InitTy)); } if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) { EVT VT = EVT::getEVT(CFP->getType()); PointerType *PtrTy = PointerType::get(CFP->getType(), 0); return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, DL, VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(CFP->getType())); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(CFP->getType())); } if (StructType *ST = dyn_cast<StructType>(InitTy)) { @@ -769,8 +825,8 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init, EVT VT = EVT::getEVT(InitTy); PointerType *PtrTy = PointerType::get(InitTy, AMDGPUAS::PRIVATE_ADDRESS); return DAG.getStore(Chain, DL, DAG.getUNDEF(VT), InitPtr, - MachinePointerInfo(UndefValue::get(PtrTy)), false, - false, TD.getPrefTypeAlignment(InitTy)); + MachinePointerInfo(UndefValue::get(PtrTy)), + TD.getPrefTypeAlignment(InitTy)); } Init->dump(); @@ -782,10 +838,7 @@ static bool hasDefinedInitializer(const GlobalValue *GV) { if (!GVar || !GVar->hasInitializer()) return false; - if (isa<UndefValue>(GVar->getInitializer())) - return false; - - return true; + return !isa<UndefValue>(GVar->getInitializer()); } SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, @@ -797,6 +850,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, const GlobalValue *GV = G->getGlobal(); switch (G->getAddressSpace()) { + case AMDGPUAS::CONSTANT_ADDRESS: { + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(G), ConstPtrVT); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(G), ConstPtrVT, GA); + } case AMDGPUAS::LOCAL_ADDRESS: { // XXX: What does the value of G->getOffset() mean? assert(G->getOffset() == 0 && @@ -808,11 +866,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, unsigned Offset; if (MFI->LocalMemoryObjects.count(GV) == 0) { - uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType()); - Offset = MFI->LDSSize; + unsigned Align = GV->getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV->getValueType()); + + /// TODO: We should sort these to minimize wasted space due to alignment + /// padding. Currently the padding is decided by the first encountered use + /// during lowering. + Offset = MFI->LDSSize = alignTo(MFI->LDSSize, Align); MFI->LocalMemoryObjects[GV] = Offset; - // XXX: Account for alignment? 
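(The new LDS lowering just above places each global at alignTo(MFI->LDSSize, Align) instead of simply appending it, so the chosen offset now honors the global's alignment, and the TODO notes that first-use order decides how much padding is wasted. As a minimal standalone sketch of that bookkeeping — not the LLVM code itself; the object names, sizes, and alignments are invented for illustration:

    #include <cstdint>
    #include <iostream>

    // Round Value up to the next multiple of Align, as llvm::alignTo does for
    // the LDS offsets in the hunk above.
    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;
    }

    int main() {
      struct Obj { const char *Name; uint64_t Size; uint64_t Align; };
      // Hypothetical LDS globals, visited in first-use order.
      const Obj Objs[] = {{"a", 1, 1}, {"b", 16, 16}, {"c", 4, 4}};

      uint64_t LDSSize = 0;
      for (const Obj &O : Objs) {
        uint64_t Offset = LDSSize = alignTo(LDSSize, O.Align); // place at aligned offset
        LDSSize += O.Size;                                     // then bump the running size
        std::cout << O.Name << " -> offset " << Offset << "\n";
      }
      // Prints a -> 0, b -> 16, c -> 32; the 15 bytes of padding before "b" are
      // the waste the TODO about sorting by alignment refers to.
      std::cout << "total LDS size: " << LDSSize << "\n";
      return 0;
    }
)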
- MFI->LDSSize += Size; + MFI->LDSSize += DL.getTypeAllocSize(GV->getValueType()); } else { Offset = MFI->LocalMemoryObjects[GV]; } @@ -820,50 +883,11 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, return DAG.getConstant(Offset, SDLoc(Op), getPointerTy(DL, AMDGPUAS::LOCAL_ADDRESS)); } - case AMDGPUAS::CONSTANT_ADDRESS: { - MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); - Type *EltType = GV->getType()->getElementType(); - unsigned Size = DL.getTypeAllocSize(EltType); - unsigned Alignment = DL.getPrefTypeAlignment(EltType); - - MVT PrivPtrVT = getPointerTy(DL, AMDGPUAS::PRIVATE_ADDRESS); - MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - - int FI = FrameInfo->CreateStackObject(Size, Alignment, false); - SDValue InitPtr = DAG.getFrameIndex(FI, PrivPtrVT); - - const GlobalVariable *Var = cast<GlobalVariable>(GV); - if (!Var->hasInitializer()) { - // This has no use, but bugpoint will hit it. - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } - - const Constant *Init = Var->getInitializer(); - SmallVector<SDNode*, 8> WorkList; - - for (SDNode::use_iterator I = DAG.getEntryNode()->use_begin(), - E = DAG.getEntryNode()->use_end(); I != E; ++I) { - if (I->getOpcode() != AMDGPUISD::REGISTER_LOAD && I->getOpcode() != ISD::LOAD) - continue; - WorkList.push_back(*I); - } - SDValue Chain = LowerConstantInitializer(Init, GV, InitPtr, DAG.getEntryNode(), DAG); - for (SmallVector<SDNode*, 8>::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - SmallVector<SDValue, 8> Ops; - Ops.push_back(Chain); - for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) { - Ops.push_back((*I)->getOperand(i)); - } - DAG.UpdateNodeOperands(*I, Ops); - } - return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op), ConstPtrVT); - } } const Function &Fn = *DAG.getMachineFunction().getFunction(); - DiagnosticInfoUnsupported BadInit(Fn, - "initializer for address space"); + DiagnosticInfoUnsupported BadInit( + Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc()); DAG.getContext()->diagnose(BadInit); return SDValue(); } @@ -875,7 +899,7 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op, for (const SDUse &U : Op->ops()) DAG.ExtractVectorElements(U.get(), Args); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, @@ -887,23 +911,7 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, DAG.ExtractVectorElements(Op.getOperand(0), Args, Start, VT.getVectorNumElements()); - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args); -} - -SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, - SelectionDAG &DAG) const { - - MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = Subtarget->getFrameLowering(); - - FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); - - unsigned FrameIndex = FIN->getIndex(); - unsigned IgnoredFrameReg; - unsigned Offset = - TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); - return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), - Op.getValueType()); + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args); } SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, @@ -914,121 +922,10 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { default: return Op; - case 
AMDGPUIntrinsic::AMDGPU_abs: - case AMDGPUIntrinsic::AMDIL_abs: // Legacy name. - return LowerIntrinsicIABS(Op, DAG); - case AMDGPUIntrinsic::AMDGPU_lrp: - return LowerIntrinsicLRP(Op, DAG); - - case AMDGPUIntrinsic::AMDGPU_clamp: - case AMDGPUIntrinsic::AMDIL_clamp: // Legacy name. + case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name. return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::AMDGPU_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!Param) - return DAG.getUNDEF(VT); - - // Translate to the operands expected by the machine instruction. The - // first parameter must be the same as the first instruction. - SDValue Numerator = Op.getOperand(1); - SDValue Denominator = Op.getOperand(2); - - // Note this order is opposite of the machine instruction's operations, - // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The - // intrinsic has the numerator as the first operand to match a normal - // division operation. - - SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; - - return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, - Denominator, Numerator); - } - - case Intrinsic::AMDGPU_div_fmas: - return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), - Op.getOperand(4)); - - case Intrinsic::AMDGPU_div_fixup: - return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case Intrinsic::AMDGPU_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case Intrinsic::AMDGPU_rcp: - return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq: - return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_legacy_rsq: - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - - case Intrinsic::AMDGPU_rsq_clamped: - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - Type *Type = VT.getTypeForEVT(*DAG.getContext()); - APFloat Max = APFloat::getLargest(Type->getFltSemantics()); - APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); - - SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, - DAG.getConstantFP(Max, DL, VT)); - return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, - DAG.getConstantFP(Min, DL, VT)); - } else { - return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1)); - } - - case Intrinsic::AMDGPU_ldexp: - return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imax: - return DAG.getNode(ISD::SMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umax: - return DAG.getNode(ISD::UMAX, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_imin: - return DAG.getNode(ISD::SMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_umin: - return DAG.getNode(ISD::UMIN, DL, VT, Op.getOperand(1), - Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umul24: - return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_imul24: - return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDGPU_umad24: - return 
DAG.getNode(AMDGPUISD::MAD_U24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_imad24: - return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT, - Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3: - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_bfe_i32: return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1), @@ -1040,69 +937,13 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfi: - return DAG.getNode(AMDGPUISD::BFI, DL, VT, - Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3)); - - case AMDGPUIntrinsic::AMDGPU_bfm: - return DAG.getNode(AMDGPUISD::BFM, DL, VT, - Op.getOperand(1), - Op.getOperand(2)); - - case Intrinsic::AMDGPU_class: - return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, - Op.getOperand(1), Op.getOperand(2)); - - case AMDGPUIntrinsic::AMDIL_exp: // Legacy name. - return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); - - case AMDGPUIntrinsic::AMDIL_round_nearest: // Legacy name. - return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. - return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name - return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } -///IABS(a) = SMAX(sub(0, a), a) -SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), - Op.getOperand(1)); - - return DAG.getNode(ISD::SMAX, DL, VT, Neg, Op.getOperand(1)); -} - -/// Linear Interpolation -/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) -SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - // TODO: Should this propagate fast-math-flags? 
- SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, - DAG.getConstantFP(1.0f, DL, MVT::f32), - Op.getOperand(1)); - SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, - Op.getOperand(3)); - return DAG.getNode(ISD::FADD, DL, VT, - DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), Op.getOperand(2)), - OneSubAC); -} - /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, +SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, + SDValue LHS, SDValue RHS, + SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -1176,56 +1017,48 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, - SelectionDAG &DAG) const { - LoadSDNode *Load = cast<LoadSDNode>(Op); - EVT MemVT = Load->getMemoryVT(); - EVT MemEltVT = MemVT.getVectorElementType(); +std::pair<SDValue, SDValue> +AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); - EVT LoadVT = Op.getValueType(); - EVT EltVT = LoadVT.getVectorElementType(); - EVT PtrVT = Load->getBasePtr().getValueType(); + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); - unsigned NumElts = Load->getMemoryVT().getVectorNumElements(); - SmallVector<SDValue, 8> Loads; - SmallVector<SDValue, 8> Chains; + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); - SDLoc SL(Op); - unsigned MemEltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(), - DAG.getConstant(i * MemEltSize, SL, PtrVT)); + return std::make_pair(Lo, Hi); +} - SDValue NewLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, EltVT, - Load->getChain(), Ptr, - SrcValue.getWithOffset(i * MemEltSize), - MemEltVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); - Loads.push_back(NewLoad.getValue(0)); - Chains.push_back(NewLoad.getValue(1)); - } +SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); - SDValue Ops[] = { - DAG.getNode(ISD::BUILD_VECTOR, SL, LoadVT, Loads), - DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains) - }; + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); + const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero); +} - return DAG.getMergeValues(Ops, SL); +SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + + SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op); + const SDValue One = DAG.getConstant(1, SL, MVT::i32); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One); } SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *Load = cast<LoadSDNode>(Op); EVT VT = Op.getValueType(); + // If this is a 2 element vector, we really want to scalarize and not create // weird 1 element vectors. 
if (VT.getVectorNumElements() == 2) - return ScalarizeVectorLoad(Op, DAG); + return scalarizeVectorLoad(Load, DAG); - LoadSDNode *Load = cast<LoadSDNode>(Op); SDValue BasePtr = Load->getBasePtr(); EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); @@ -1245,22 +1078,15 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, unsigned BaseAlign = Load->getAlignment(); unsigned HiAlign = MinAlign(BaseAlign, Size); - SDValue LoLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, - Load->getChain(), BasePtr, - SrcValue, - LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), BaseAlign); - + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, + Load->getChain(), BasePtr, SrcValue, LoMemVT, + BaseAlign, Load->getMemOperand()->getFlags()); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Size, SL, PtrVT)); - - SDValue HiLoad - = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, - Load->getChain(), HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), - HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), HiAlign); + SDValue HiLoad = + DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), + HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), + HiMemVT, HiAlign, Load->getMemOperand()->getFlags()); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1271,6 +1097,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, return DAG.getMergeValues(Ops, SL); } +// FIXME: This isn't doing anything for SI. This should be used in a target +// combine during type legalization. SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); @@ -1317,48 +1145,15 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op, if (PackedSize < 32) { EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize); return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr, - Store->getMemOperand()->getPointerInfo(), - PackedVT, - Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + Store->getMemOperand()->getPointerInfo(), PackedVT, + Store->getAlignment(), + Store->getMemOperand()->getFlags()); } return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr, Store->getMemOperand()->getPointerInfo(), - Store->isVolatile(), Store->isNonTemporal(), - Store->getAlignment()); -} - -SDValue AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op, - SelectionDAG &DAG) const { - StoreSDNode *Store = cast<StoreSDNode>(Op); - EVT MemEltVT = Store->getMemoryVT().getVectorElementType(); - EVT EltVT = Store->getValue().getValueType().getVectorElementType(); - EVT PtrVT = Store->getBasePtr().getValueType(); - unsigned NumElts = Store->getMemoryVT().getVectorNumElements(); - SDLoc SL(Op); - - SmallVector<SDValue, 8> Chains; - - unsigned EltSize = MemEltVT.getStoreSize(); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); - - for (unsigned i = 0, e = NumElts; i != e; ++i) { - SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, - Store->getValue(), - DAG.getConstant(i, SL, MVT::i32)); - - SDValue Offset = DAG.getConstant(i * MemEltVT.getStoreSize(), SL, PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Store->getBasePtr(), Offset); - SDValue NewStore = - DAG.getTruncStore(Store->getChain(), SL, Val, Ptr, - SrcValue.getWithOffset(i * EltSize), - MemEltVT, Store->isNonTemporal(), Store->isVolatile(), - 
Store->getAlignment()); - Chains.push_back(NewStore); - } - - return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains); + Store->getAlignment(), + Store->getMemOperand()->getFlags()); } SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, @@ -1370,7 +1165,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, // If this is a 2 element vector, we really want to scalarize and not create // weird 1 element vectors. if (VT.getVectorNumElements() == 2) - return ScalarizeVectorStore(Op, DAG); + return scalarizeVectorStore(Store, DAG); EVT MemVT = Store->getMemoryVT(); SDValue Chain = Store->getChain(); @@ -1395,171 +1190,21 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, unsigned Size = LoMemVT.getStoreSize(); unsigned HiAlign = MinAlign(BaseAlign, Size); - SDValue LoStore - = DAG.getTruncStore(Chain, SL, Lo, - BasePtr, - SrcValue, - LoMemVT, - Store->isNonTemporal(), - Store->isVolatile(), - BaseAlign); - SDValue HiStore - = DAG.getTruncStore(Chain, SL, Hi, - HiPtr, - SrcValue.getWithOffset(Size), - HiMemVT, - Store->isNonTemporal(), - Store->isVolatile(), - HiAlign); + SDValue LoStore = + DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign, + Store->getMemOperand()->getFlags()); + SDValue HiStore = + DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), + HiMemVT, HiAlign, Store->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } - -SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - LoadSDNode *Load = cast<LoadSDNode>(Op); - ISD::LoadExtType ExtType = Load->getExtensionType(); - EVT VT = Op.getValueType(); - EVT MemVT = Load->getMemoryVT(); - - if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) { - assert(VT == MVT::i1 && "Only i1 non-extloads expected"); - // FIXME: Copied from PPC - // First, load into 32 bits, then truncate to 1 bit. - - SDValue Chain = Load->getChain(); - SDValue BasePtr = Load->getBasePtr(); - MachineMemOperand *MMO = Load->getMemOperand(); - - SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, - BasePtr, MVT::i8, MMO); - - SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD), - NewLD.getValue(1) - }; - - return DAG.getMergeValues(Ops, DL); - } - - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS || - Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS || - ExtType == ISD::NON_EXTLOAD || Load->getMemoryVT().bitsGE(MVT::i32)) - return SDValue(); - - // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, - // register (2-)byte extract. - - // Get Register holding the target. - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), - DAG.getConstant(2, DL, MVT::i32)); - // Load the Register. - SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), - Load->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); - - // Get offset within the register. - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, - Load->getBasePtr(), - DAG.getConstant(0x3, DL, MVT::i32)); - - // Bit offset of target byte (byteIdx * 8). - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - // Shift to the right. - Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); - - // Eliminate the upper bits by setting them to ... - EVT MemEltVT = MemVT.getScalarType(); - - // ... ones. 
- if (ExtType == ISD::SEXTLOAD) { - SDValue MemEltVTNode = DAG.getValueType(MemEltVT); - - SDValue Ops[] = { - DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); - } - - // ... or zeros. - SDValue Ops[] = { - DAG.getZeroExtendInReg(Ret, DL, MemEltVT), - Load->getChain() - }; - - return DAG.getMergeValues(Ops, DL); -} - -SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG); - if (Result.getNode()) { - return Result; - } - - StoreSDNode *Store = cast<StoreSDNode>(Op); - SDValue Chain = Store->getChain(); - if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || - Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && - Store->getValue().getValueType().isVector()) { - return SplitVectorStore(Op, DAG); - } - - EVT MemVT = Store->getMemoryVT(); - if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS && - MemVT.bitsLT(MVT::i32)) { - unsigned Mask = 0; - if (Store->getMemoryVT() == MVT::i8) { - Mask = 0xff; - } else if (Store->getMemoryVT() == MVT::i16) { - Mask = 0xffff; - } - SDValue BasePtr = Store->getBasePtr(); - SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, - DAG.getConstant(2, DL, MVT::i32)); - SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, - Chain, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - - SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, - DAG.getConstant(0x3, DL, MVT::i32)); - - SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, - DAG.getConstant(3, DL, MVT::i32)); - - SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, - Store->getValue()); - - SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); - - SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, - MaskedValue, ShiftAmt); - - SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, - DAG.getConstant(Mask, DL, MVT::i32), - ShiftAmt); - DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, - DAG.getConstant(0xffffffff, DL, MVT::i32)); - Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); - - SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); - return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, - Chain, Value, Ptr, - DAG.getTargetConstant(0, DL, MVT::i32)); - } - return SDValue(); -} - // This is a shortcut for integer division because we have fast i32<->f32 // conversions, and fast f32 reciprocal instructions. The fractional part of a -// float is enough to accurately represent up to a 24-bit integer. -SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const { +// float is enough to accurately represent up to a 24-bit signed integer. +SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, + bool Sign) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); @@ -1567,20 +1212,26 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool MVT IntVT = MVT::i32; MVT FltVT = MVT::f32; - ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; - ISD::NodeType ToInt = sign ? 
ISD::FP_TO_SINT : ISD::FP_TO_UINT; + unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS); + if (LHSSignBits < 9) + return SDValue(); - if (VT.isVector()) { - unsigned NElts = VT.getVectorNumElements(); - IntVT = MVT::getVectorVT(MVT::i32, NElts); - FltVT = MVT::getVectorVT(MVT::f32, NElts); - } + unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS); + if (RHSSignBits < 9) + return SDValue(); + + unsigned BitSize = VT.getSizeInBits(); + unsigned SignBits = std::min(LHSSignBits, RHSSignBits); + unsigned DivBits = BitSize - SignBits; + if (Sign) + ++DivBits; - unsigned BitSize = VT.getScalarType().getSizeInBits(); + ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP; + ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; SDValue jq = DAG.getConstant(1, DL, IntVT); - if (sign) { + if (Sign) { // char|short jq = ia ^ ib; jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); @@ -1590,18 +1241,13 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // jq = jq | 0x1 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT)); - - // jq = (int)jq - jq = DAG.getSExtOrTrunc(jq, DL, IntVT); } // int ia = (int)LHS; - SDValue ia = sign ? - DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT); + SDValue ia = LHS; // int ib, (int)RHS; - SDValue ib = sign ? - DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT); + SDValue ib = RHS; // float fa = (float)ia; SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia); @@ -1609,8 +1255,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); - // TODO: Should this propagate fast-math-flags? - // float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1621,8 +1265,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq); // float fr = mad(fqneg, fb, fa); - SDValue fr = DAG.getNode(ISD::FADD, DL, FltVT, - DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa); + SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq); @@ -1641,9 +1284,6 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // jq = (cv ? jq : 0); jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT)); - // dst = trunc/extend to legal type - iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT); - // dst = iq + jq; SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq); @@ -1651,11 +1291,19 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS); Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem); - SDValue Res[2] = { - Div, - Rem - }; - return DAG.getMergeValues(Res, DL); + // Truncate to number of bits this divide really is. 
+ if (Sign) { + SDValue InRegSize + = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits)); + Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize); + Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize); + } else { + SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT); + Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask); + Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask); + } + + return DAG.getMergeValues({ Div, Rem }, DL); } void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, @@ -1686,10 +1334,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT), LHS_Lo, RHS_Lo); - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(0), zero); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, Res.getValue(1), zero); - Results.push_back(DIV); - Results.push_back(REM); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero}); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero}); + + Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV)); + Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM)); return; } @@ -1698,7 +1347,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo); SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ); - SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, zero); + SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero}); + REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM); SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ); SDValue DIV_Lo = zero; @@ -1718,7 +1368,7 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, // Add LHS high bit REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit); - SDValue BIT = DAG.getConstant(1 << bitPos, DL, HalfVT); + SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT); SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE); DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT); @@ -1728,7 +1378,8 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op, REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE); } - SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi); + SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi}); + DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV); Results.push_back(DIV); Results.push_back(REM); } @@ -1744,19 +1395,14 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, return DAG.getMergeValues(Results, DL); } - SDValue Num = Op.getOperand(0); - SDValue Den = Op.getOperand(1); - if (VT == MVT::i32) { - if (DAG.MaskedValueIsZero(Num, APInt::getHighBitsSet(32, 8)) && - DAG.MaskedValueIsZero(Den, APInt::getHighBitsSet(32, 8))) { - // TODO: We technically could do this for i64, but shouldn't that just be - // handled by something generally reducing 64-bit division on 32-bit - // values to 32-bit? - return LowerDIVREM24(Op, DAG, false); - } + if (SDValue Res = LowerDIVREM24(Op, DAG, false)) + return Res; } + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + // RCP = URECIP(Den) = 2^32 / Den + e // e is rounding error. 
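(LowerDIVREM24, completed just above, implements the shortcut its comment describes: convert both operands to f32, multiply by the reciprocal, truncate, then use one fused multiply-add to decide whether the truncated quotient needs a +1 correction, and finally mask or sign-extend down to the number of bits the divide really uses. A host-side sketch of the same idea, not the DAG lowering itself: 1.0f / fb stands in for the approximate RCP instruction, std::fma for v_mad_f32, and the test values stay well inside 24 bits as the new sign-bit guard requires:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Host-side model of the 24-bit unsigned division expansion: one float
    // reciprocal, a truncation, and a single fused multiply-add to decide
    // whether the truncated quotient must be bumped by one.
    static uint32_t udiv24(uint32_t ia, uint32_t ib) {
      float fa = static_cast<float>(ia);          // float fa = (float)ia;
      float fb = static_cast<float>(ib);          // float fb = (float)ib;
      float fq = std::trunc(fa * (1.0f / fb));    // fq = trunc(native_divide(fa, fb));
      float fr = std::fma(-fq, fb, fa);           // fr = mad(fqneg, fb, fa);
      uint32_t iq = static_cast<uint32_t>(fq);    // int iq = (int)fq;
      uint32_t jq = (std::fabs(fr) >= std::fabs(fb)) ? 1u : 0u; // jq = cv ? jq : 0;
      return iq + jq;                             // dst = iq + jq;
    }

    int main() {
      const uint32_t as[] = {0u, 1u, 7u, 8388607u};   // all comfortably within 24 bits
      const uint32_t bs[] = {1u, 3u, 7u, 8388607u};
      for (uint32_t a : as)
        for (uint32_t b : bs)
          if (udiv24(a, b) != a / b)
            std::printf("mismatch: %u / %u\n", a, b);
      std::puts("done");
      return 0;
    }
)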
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); @@ -1864,11 +1510,11 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op, SDValue Zero = DAG.getConstant(0, DL, VT); SDValue NegOne = DAG.getConstant(-1, DL, VT); - if (VT == MVT::i32 && - DAG.ComputeNumSignBits(LHS) > 8 && - DAG.ComputeNumSignBits(RHS) > 8) { - return LowerDIVREM24(Op, DAG, true); + if (VT == MVT::i32) { + if (SDValue Res = LowerDIVREM24(Op, DAG, true)) + return Res; } + if (VT == MVT::i64 && DAG.ComputeNumSignBits(LHS) > 32 && DAG.ComputeNumSignBits(RHS) > 32) { @@ -1954,7 +1600,8 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } -static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { +static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, + SelectionDAG &DAG) { const unsigned FractBits = 52; const unsigned ExpBits = 11; @@ -1992,8 +1639,7 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask); // Extend back to to 64-bits. - SDValue SignBit64 = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, - Zero, SignBit); + SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit}); SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64); SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src); @@ -2391,7 +2037,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::i32, FloorMul); SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma); - SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Lo, Hi); + SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result); } @@ -2437,7 +2083,7 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, for (unsigned I = 0; I < NElts; ++I) Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp); - return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args); + return DAG.getBuildVector(VT, DL, Args); } //===----------------------------------------------------------------------===// @@ -2476,8 +2122,8 @@ static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) { } template <typename IntTy> -static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, - uint32_t Offset, uint32_t Width, SDLoc DL) { +static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, + uint32_t Width, const SDLoc &DL) { if (Width + Offset < 32) { uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width); IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width); @@ -2487,55 +2133,175 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, return DAG.getConstant(Src0 >> Offset, DL, MVT::i32); } -static bool usesAllNormalStores(SDNode *LoadVal) { - for (SDNode::use_iterator I = LoadVal->use_begin(); !I.atEnd(); ++I) { - if (!ISD::isNormalStore(*I)) - return false; +static bool hasVolatileUser(SDNode *Val) { + for (SDNode *U : Val->uses()) { + if (MemSDNode *M = dyn_cast<MemSDNode>(U)) { + if (M->isVolatile()) + return true; + } } + return false; +} + +bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const { + // i32 vectors are the canonical memory type. 
+ if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT)) + return false; + + if (!VT.isByteSized()) + return false; + + unsigned Size = VT.getStoreSize(); + + if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector()) + return false; + + if (Size == 3 || (Size > 4 && (Size % 4 != 0))) + return false; + return true; } -// If we have a copy of an illegal type, replace it with a load / store of an -// equivalently sized legal type. This avoids intermediate bit pack / unpack -// instructions emitted when handling extloads and truncstores. Ideally we could -// recognize the pack / unpack pattern to eliminate it. +// Replace load of an illegal type with a store of a bitcast to a friendlier +// type. +SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + LoadSDNode *LN = cast<LoadSDNode>(N); + if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN)) + return SDValue(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + EVT VT = LN->getMemoryVT(); + + unsigned Size = VT.getStoreSize(); + unsigned Align = LN->getAlignment(); + if (Align < Size && isTypeLegal(VT)) { + bool IsFast; + unsigned AS = LN->getAddressSpace(); + + // Expand unaligned loads earlier than legalization. Due to visitation order + // problems during legalization, the emitted instructions to pack and unpack + // the bytes again are not eliminated in the case of an unaligned copy. + if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG); + return DAG.getMergeValues(Ops, SDLoc(N)); + } + + if (!IsFast) + return SDValue(); + } + + if (!shouldCombineMemoryType(VT)) + return SDValue(); + + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); + + SDValue NewLoad + = DAG.getLoad(NewVT, SL, LN->getChain(), + LN->getBasePtr(), LN->getMemOperand()); + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad); + DCI.CombineTo(N, BC, NewLoad.getValue(1)); + return SDValue(N, 0); +} + +// Replace store of an illegal type with a store of a bitcast to a friendlier +// type. SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (!DCI.isBeforeLegalize()) return SDValue(); StoreSDNode *SN = cast<StoreSDNode>(N); - SDValue Value = SN->getValue(); - EVT VT = Value.getValueType(); + if (SN->isVolatile() || !ISD::isNormalStore(SN)) + return SDValue(); - if (isTypeLegal(VT) || SN->isVolatile() || - !ISD::isNormalLoad(Value.getNode()) || VT.getSizeInBits() < 8) + EVT VT = SN->getMemoryVT(); + unsigned Size = VT.getStoreSize(); + + SDLoc SL(N); + SelectionDAG &DAG = DCI.DAG; + unsigned Align = SN->getAlignment(); + if (Align < Size && isTypeLegal(VT)) { + bool IsFast; + unsigned AS = SN->getAddressSpace(); + + // Expand unaligned stores earlier than legalization. Due to visitation + // order problems during legalization, the emitted instructions to pack and + // unpack the bytes again are not eliminated in the case of an unaligned + // copy. 
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) + return expandUnalignedStore(SN, DAG); + + if (!IsFast) + return SDValue(); + } + + if (!shouldCombineMemoryType(VT)) + return SDValue(); + + EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT); + SDValue Val = SN->getValue(); + + //DCI.AddToWorklist(Val.getNode()); + + bool OtherUses = !Val.hasOneUse(); + SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val); + if (OtherUses) { + SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal); + DAG.ReplaceAllUsesOfValueWith(Val, CastBack); + } + + return DAG.getStore(SN->getChain(), SL, CastVal, + SN->getBasePtr(), SN->getMemOperand()); +} + +// TODO: Should repeat for other bit ops. +SDValue AMDGPUTargetLowering::performAndCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) return SDValue(); - LoadSDNode *LoadVal = cast<LoadSDNode>(Value); - if (LoadVal->isVolatile() || !usesAllNormalStores(LoadVal)) + // Break up 64-bit and of a constant into two 32-bit ands. This will typically + // happen anyway for a VALU 64-bit and. This exposes other 32-bit integer + // combine opportunities since most 64-bit operations are decomposed this way. + // TODO: We won't want this for SALU especially if it is an inline immediate. + const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHS) return SDValue(); - EVT MemVT = LoadVal->getMemoryVT(); + uint64_t Val = RHS->getZExtValue(); + if (Lo_32(Val) != 0 && Hi_32(Val) != 0 && !RHS->hasOneUse()) { + // If either half of the constant is 0, this is really a 32-bit and, so + // split it. If we can re-use the full materialized constant, keep it. + return SDValue(); + } SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), MemVT); - SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, - LoadVT, SL, - LoadVal->getChain(), - LoadVal->getBasePtr(), - LoadVal->getOffset(), - LoadVT, - LoadVal->getMemOperand()); + SDValue Lo, Hi; + std::tie(Lo, Hi) = split64BitValue(N->getOperand(0), DAG); - SDValue CastLoad = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad.getValue(0)); - DCI.CombineTo(LoadVal, CastLoad, NewLoad.getValue(1), false); + SDValue LoRHS = DAG.getConstant(Lo_32(Val), SL, MVT::i32); + SDValue HiRHS = DAG.getConstant(Hi_32(Val), SL, MVT::i32); - return DAG.getStore(SN->getChain(), SL, NewLoad, - SN->getBasePtr(), SN->getMemOperand()); + SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, LoRHS); + SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, HiRHS); + + // Re-visit the ands. It's possible we eliminated one of them and it could + // simplify the vector. + DCI.AddToWorklist(Lo.getNode()); + DCI.AddToWorklist(Hi.getNode()); + + SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); } SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, @@ -2543,14 +2309,17 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, if (N->getValueType(0) != MVT::i64) return SDValue(); - // i64 (shl x, 32) -> (build_pair 0, x) + // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) - // Doing this with moves theoretically helps MI optimizations that understand - // copies. 2 v_mov_b32_e32 will have the same code size / cycle count as - // v_lshl_b64. In the SALU case, I think this is slightly worse since it - // doubles the code size and I'm unsure about cycle count. + // On some subtargets, 64-bit shift is a quarter rate instruction. 
In the + // common case, splitting this into a move and a 32-bit shift is faster and + // the same code size. const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); - if (!RHS || RHS->getZExtValue() != 32) + if (!RHS) + return SDValue(); + + unsigned RHSVal = RHS->getZExtValue(); + if (RHSVal < 32) return SDValue(); SDValue LHS = N->getOperand(0); @@ -2558,11 +2327,85 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDLoc SL(N); SelectionDAG &DAG = DCI.DAG; - // Extract low 32-bits. + SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32); + SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); + SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt); const SDValue Zero = DAG.getConstant(0, SL, MVT::i32); - return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo); + + SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); +} + +SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) + return SDValue(); + + const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHS) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + unsigned RHSVal = RHS->getZExtValue(); + + // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31) + if (RHSVal == 32) { + SDValue Hi = getHiHalf64(N->getOperand(0), DAG); + SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, + DAG.getConstant(31, SL, MVT::i32)); + + SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); + } + + // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31) + if (RHSVal == 63) { + SDValue Hi = getHiHalf64(N->getOperand(0), DAG); + SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi, + DAG.getConstant(31, SL, MVT::i32)); + SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift}); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec); + } + + return SDValue(); +} + +SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + if (N->getValueType(0) != MVT::i64) + return SDValue(); + + const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHS) + return SDValue(); + + unsigned ShiftAmt = RHS->getZExtValue(); + if (ShiftAmt < 32) + return SDValue(); + + // srl i64:x, C for C >= 32 + // => + // build_pair (srl hi_32(x), C - 32), 0 + + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + + SDValue One = DAG.getConstant(1, SL, MVT::i32); + SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + + SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, + VecOp, One); + + SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32); + SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst); + + SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero}); + + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair); } SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, @@ -2610,8 +2453,8 @@ static bool isCtlzOpc(unsigned Opc) { // type VT. // Need to match pre-legalized type because the generic legalization inserts the // add/sub between the select and compare. 
-static SDValue getFFBH_U32(const TargetLowering &TLI, - SelectionDAG &DAG, SDLoc SL, SDValue Op) { +static SDValue getFFBH_U32(const TargetLowering &TLI, SelectionDAG &DAG, + const SDLoc &SL, SDValue Op) { EVT VT = Op.getValueType(); EVT LegalVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); if (LegalVT != MVT::i32) @@ -2634,10 +2477,8 @@ static SDValue getFFBH_U32(const TargetLowering &TLI, // against the bitwidth. // // TODO: Should probably combine against FFBH_U32 instead of ctlz directly. -SDValue AMDGPUTargetLowering::performCtlzCombine(SDLoc SL, - SDValue Cond, - SDValue LHS, - SDValue RHS, +SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond, + SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const { ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); if (!CmpRhs || !CmpRhs->isNullValue()) @@ -2680,8 +2521,13 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, SDValue True = N->getOperand(1); SDValue False = N->getOperand(2); - if (VT == MVT::f32 && Cond.hasOneUse()) - return CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + if (VT == MVT::f32 && Cond.hasOneUse()) { + SDValue MinMax + = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI); + // Revisit this node so we can catch min3/max3/med3 patterns. + //DCI.AddToWorklist(MinMax.getNode()); + return MinMax; + } // There's no reason to not do this if the condition has other uses. return performCtlzCombine(SDLoc(N), Cond, True, False, DCI); @@ -2695,12 +2541,62 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, switch(N->getOpcode()) { default: break; + case ISD::BITCAST: { + EVT DestVT = N->getValueType(0); + if (DestVT.getSizeInBits() != 64 && !DestVT.isVector()) + break; + + // Fold bitcasts of constants. 
+ // + // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k) + // TODO: Generalize and move to DAGCombiner + SDValue Src = N->getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) { + assert(Src.getValueType() == MVT::i64); + SDLoc SL(N); + uint64_t CVal = C->getZExtValue(); + return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + } + + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) { + const APInt &Val = C->getValueAPF().bitcastToAPInt(); + SDLoc SL(N); + uint64_t CVal = Val.getZExtValue(); + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + DAG.getConstant(Lo_32(CVal), SL, MVT::i32), + DAG.getConstant(Hi_32(CVal), SL, MVT::i32)); + + return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec); + } + + break; + } case ISD::SHL: { if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) break; return performShlCombine(N, DCI); } + case ISD::SRL: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performSrlCombine(N, DCI); + } + case ISD::SRA: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performSraCombine(N, DCI); + } + case ISD::AND: { + if (DCI.getDAGCombineLevel() < AfterLegalizeDAG) + break; + + return performAndCombine(N, DCI); + } case ISD::MUL: return performMulCombine(N, DCI); case AMDGPUISD::MUL_I24: @@ -2797,7 +2693,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } - + case ISD::LOAD: + return performLoadCombine(N, DCI); case ISD::STORE: return performStoreCombine(N, DCI); } @@ -2840,20 +2737,6 @@ void AMDGPUTargetLowering::getOriginalFunctionArgs( } } -bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { - return CFP->isExactlyValue(1.0); - } - return isAllOnesConstant(Op); -} - -bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { - if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { - return CFP->getValueAPF().isZero(); - } - return isNullConstant(Op); -} - SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2889,10 +2772,11 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { // AMDIL DAG nodes NODE_NAME_CASE(CALL); NODE_NAME_CASE(UMUL); - NODE_NAME_CASE(RET_FLAG); NODE_NAME_CASE(BRANCH_COND); // AMDGPU DAG nodes + NODE_NAME_CASE(ENDPGM) + NODE_NAME_CASE(RETURN) NODE_NAME_CASE(DWORDADDR) NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) @@ -2906,6 +2790,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN3) NODE_NAME_CASE(SMIN3) NODE_NAME_CASE(UMIN3) + NODE_NAME_CASE(FMED3) + NODE_NAME_CASE(SMED3) + NODE_NAME_CASE(UMED3) NODE_NAME_CASE(URECIP) NODE_NAME_CASE(DIV_SCALE) NODE_NAME_CASE(DIV_FMAS) @@ -2914,7 +2801,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(RCP) NODE_NAME_CASE(RSQ) NODE_NAME_CASE(RSQ_LEGACY) - NODE_NAME_CASE(RSQ_CLAMPED) + NODE_NAME_CASE(RSQ_CLAMP) NODE_NAME_CASE(LDEXP) NODE_NAME_CASE(FP_CLASS) NODE_NAME_CASE(DOT4) @@ -2934,7 +2821,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) NODE_NAME_CASE(REGISTER_STORE) - NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(LOAD_INPUT) NODE_NAME_CASE(SAMPLE) NODE_NAME_CASE(SAMPLEB) @@ -2946,13 +2832,18 @@ const char* 
AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE3) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) + NODE_NAME_CASE(PC_ADD_REL_OFFSET) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(INTERP_MOV) NODE_NAME_CASE(INTERP_P1) NODE_NAME_CASE(INTERP_P2) NODE_NAME_CASE(STORE_MSKOR) + NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(ATOMIC_CMP_SWAP) + NODE_NAME_CASE(ATOMIC_INC) + NODE_NAME_CASE(ATOMIC_DEC) case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break; } return nullptr; @@ -2998,21 +2889,6 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } -static void computeKnownBitsForMinMax(const SDValue Op0, - const SDValue Op1, - APInt &KnownZero, - APInt &KnownOne, - const SelectionDAG &DAG, - unsigned Depth) { - APInt Op0Zero, Op0One; - APInt Op1Zero, Op1One; - DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth); - DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth); - - KnownZero = Op0Zero & Op1Zero; - KnownOne = Op0One & Op1One; -} - void AMDGPUTargetLowering::computeKnownBitsForTargetNode( const SDValue Op, APInt &KnownZero, @@ -3029,22 +2905,6 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( switch (Opc) { default: break; - case ISD::INTRINSIC_WO_CHAIN: { - // FIXME: The intrinsic should just use the node. - switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) { - case AMDGPUIntrinsic::AMDGPU_imax: - case AMDGPUIntrinsic::AMDGPU_umax: - case AMDGPUIntrinsic::AMDGPU_imin: - case AMDGPUIntrinsic::AMDGPU_umin: - computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2), - KnownZero, KnownOne, DAG, Depth); - break; - default: - break; - } - - break; - } case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: { KnownZero = APInt::getHighBitsSet(32, 31); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 37925416a9c4..c2c758592d1c 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H #include "llvm/Target/TargetLowering.h" @@ -28,12 +28,10 @@ class AMDGPUTargetLowering : public TargetLowering { protected: const AMDGPUSubtarget *Subtarget; -private: SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV, const SDValue &InitPtr, SDValue Chain, SelectionDAG &DAG) const; - SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; @@ -67,42 +65,43 @@ private: SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; +protected: + bool shouldCombineMemoryType(EVT VT) const; + SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue 
performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performCtlzCombine(SDLoc SL, SDValue Cond, SDValue LHS, SDValue RHS, - DAGCombinerInfo &DCI) const; + SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, + SDValue RHS, DAGCombinerInfo &DCI) const; SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const; -protected: static EVT getEquivalentMemType(LLVMContext &Context, EVT VT); - static EVT getEquivalentLoadRegType(LLVMContext &Context, EVT VT); + static EVT getEquivalentBitType(LLVMContext &Context, EVT VT); virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector load into a scalar load of each component. - SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const; + /// Return 64-bit value Op as two 32-bit integers. + std::pair<SDValue, SDValue> split64BitValue(SDValue Op, + SelectionDAG &DAG) const; + SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const; + SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const; /// \brief Split a vector load into 2 loads of half the vector. SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into a scalar store of each component. - SDValue ScalarizeVectorStore(SDValue Op, SelectionDAG &DAG) const; - /// \brief Split a vector store into 2 stores of half the vector. SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results) const; - bool isHWTrueValue(SDValue Op) const; - bool isHWFalseValue(SDValue Op) const; - /// The SelectionDAGBuilder will automatically promote function arguments /// with illegal types. However, this does not work for the AMDGPU targets /// since the function arguments are stored in memory as these illegal types. 
@@ -119,7 +118,7 @@ protected: const SmallVectorImpl<ISD::OutputArg> &Outs) const; public: - AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; @@ -141,7 +140,7 @@ public: ISD::LoadExtType ExtType, EVT ExtVT) const override; - bool isLoadBitCastBeneficial(EVT, EVT) const override; + bool isLoadBitCastBeneficial(EVT, EVT) const final; bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, @@ -150,11 +149,10 @@ public: bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; + const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const override; SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -167,16 +165,9 @@ public: SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; - SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineFMinMaxLegacy(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - DAGCombinerInfo &DCI) const; + SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, + SDValue RHS, SDValue True, SDValue False, + SDValue CC, DAGCombinerInfo &DCI) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -189,9 +180,7 @@ public: unsigned &RefinementSteps) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, - SelectionDAG &DAG) const { - return N; - } + SelectionDAG &DAG) const = 0; /// \brief Determine which of the bits specified in \p Mask are known to be /// either zero or one and return them in the \p KnownZero and \p KnownOne @@ -214,8 +203,9 @@ public: unsigned Reg, EVT VT) const; enum ImplicitParameter { - GRID_DIM, - GRID_OFFSET + FIRST_IMPLICIT, + GRID_DIM = FIRST_IMPLICIT, + GRID_OFFSET, }; /// \brief Helper function that returns the byte offset of the given @@ -231,9 +221,10 @@ enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, CALL, // Function call based on a single integer UMUL, // 32bit unsigned multiplication - RET_FLAG, BRANCH_COND, // End AMDIL ISD Opcodes + ENDPGM, + RETURN, DWORDADDR, FRACT, CLAMP, @@ -250,6 +241,9 @@ enum NodeType : unsigned { FMIN3, SMIN3, UMIN3, + FMED3, + SMED3, + UMED3, URECIP, DIV_SCALE, DIV_FMAS, @@ -261,7 +255,7 @@ enum NodeType : unsigned { RCP, RSQ, RSQ_LEGACY, - RSQ_CLAMPED, + RSQ_CLAMP, LDEXP, FP_CLASS, DOT4, @@ -307,10 +301,14 @@ enum NodeType : unsigned { INTERP_MOV, INTERP_P1, INTERP_P2, + PC_ADD_REL_OFFSET, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + ATOMIC_CMP_SWAP, + ATOMIC_INC, + ATOMIC_DEC, LAST_AMDGPU_ISD_NUMBER }; diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index a266e711af5b..9a00ecb24ebe 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -30,163 +30,8 @@ using namespace llvm; // Pin the vtable to this file. 
void AMDGPUInstrInfo::anchor() {} -AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUGenInstrInfo(-1, -1), ST(st) {} - -const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { - return RI; -} - -bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const { -// TODO: Implement this function - return false; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} - -bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const { -// TODO: Implement this function - return 0; -} -bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const { -// TODO: Implement this function - return false; -} - -MachineInstr * -AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { -// TODO: Implement this function - return nullptr; -} - -void -AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, - int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -void -AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - llvm_unreachable("Not Implemented"); -} - -bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const { - MachineBasicBlock *MBB = MI->getParent(); - int OffsetOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::addr); - // addr is a custom operand with multiple MI operands, and only the - // first MI operand is given a name. 
- int RegOpIdx = OffsetOpIdx + 1; - int ChanOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::chan); - if (isRegisterLoad(*MI)) { - int DstOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::dst); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - getIndirectAddrRegClass()->getRegister(Address)); - } else { - buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(), - Address, OffsetReg); - } - } else if (isRegisterStore(*MI)) { - int ValOpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::val); - unsigned RegIndex = MI->getOperand(RegOpIdx).getImm(); - unsigned Channel = MI->getOperand(ChanOpIdx).getImm(); - unsigned Address = calculateIndirectAddress(RegIndex, Channel); - unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { - buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), - MI->getOperand(ValOpIdx).getReg()); - } else { - buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(), - calculateIndirectAddress(RegIndex, Channel), - OffsetReg); - } - } else { - return false; - } - - MBB->erase(MI); - return true; -} - -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { -// TODO: Implement this function - return nullptr; -} -MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { - // TODO: Implement this function - return nullptr; -} -bool -AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, - bool UnfoldStore, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - // TODO: Implement this function - return false; -} - -bool -AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode*> &NewNodes) const { - // TODO: Implement this function - return false; -} - -unsigned -AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex) const { - // TODO: Implement this function - return 0; -} +AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) + : AMDGPUGenInstrInfo(-1, -1), ST(ST) {} bool AMDGPUInstrInfo::enableClusterLoads() const { return true; @@ -214,106 +59,6 @@ bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, return (NumLoads <= 16 && (Offset1 - Offset0) < 64); } -bool -AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) - const { - // TODO: Implement this function - return true; -} -void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const { - // TODO: Implement this function -} - -bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, - 
std::vector<MachineOperand> &Pred) const { - // TODO: Implement this function - return false; -} - -bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { - // TODO: Implement this function - return MI->getDesc().isPredicable(); -} - -bool -AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - // TODO: Implement this function - return true; -} - -bool AMDGPUInstrInfo::isRegisterStore(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; -} - -bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; -} - -int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - int Offset = -1; - - if (MFI->getNumObjects() == 0) { - return -1; - } - - if (MRI.livein_empty()) { - return 0; - } - - const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); - for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), - LE = MRI.livein_end(); - LI != LE; ++LI) { - unsigned Reg = LI->first; - if (TargetRegisterInfo::isVirtualRegister(Reg) || - !IndirectRC->contains(Reg)) - continue; - - unsigned RegIndex; - unsigned RegEnd; - for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; - ++RegIndex) { - if (IndirectRC->getRegister(RegIndex) == Reg) - break; - } - Offset = std::max(Offset, (int)RegIndex); - } - - return Offset + 1; -} - -int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { - int Offset = 0; - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Variable sized objects are not supported - assert(!MFI->hasVarSizedObjects()); - - if (MFI->getNumObjects() == 0) { - return -1; - } - - unsigned IgnoredFrameReg; - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( - MF, -1, IgnoredFrameReg); - - return getIndirectIndexBegin(MF) + Offset; -} - int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { switch (Channels) { default: return Opcode; @@ -323,35 +68,44 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const { } } +// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td +enum SIEncodingFamily { + SI = 0, + VI = 1 +}; + // Wrapper for Tablegen'd function. enum Subtarget is not defined in any // header files, so we need to wrap it in a function that takes unsigned // instead. namespace llvm { namespace AMDGPU { static int getMCOpcode(uint16_t Opcode, unsigned Gen) { - return getMCOpcodeGen(Opcode, (enum Subtarget)Gen); + return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen)); } } } -// This must be kept in sync with the SISubtarget class in SIInstrInfo.td -enum SISubtarget { - SI = 0, - VI = 1 -}; - -static enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) { - switch (Gen) { - default: - return SI; +static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { + switch (ST.getGeneration()) { + case AMDGPUSubtarget::SOUTHERN_ISLANDS: + case AMDGPUSubtarget::SEA_ISLANDS: + return SIEncodingFamily::SI; case AMDGPUSubtarget::VOLCANIC_ISLANDS: - return VI; + return SIEncodingFamily::VI; + + // FIXME: This should never be called for r600 GPUs. 
+ case AMDGPUSubtarget::R600: + case AMDGPUSubtarget::R700: + case AMDGPUSubtarget::EVERGREEN: + case AMDGPUSubtarget::NORTHERN_ISLANDS: + return SIEncodingFamily::SI; } + + llvm_unreachable("Unknown subtarget generation!"); } int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - int MCOp = AMDGPU::getMCOpcode( - Opcode, AMDGPUSubtargetToSISubtarget(ST.getGeneration())); + int MCOp = AMDGPU::getMCOpcode(Opcode, subtargetEncodingFamily(ST)); // -1 means that Opcode is already a native instruction. if (MCOp == -1) @@ -364,14 +118,3 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } - -ArrayRef<std::pair<int, const char *>> -AMDGPUInstrInfo::getSerializableTargetIndices() const { - static const std::pair<int, const char *> TargetIndices[] = { - {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, - {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; - return makeArrayRef(TargetIndices); -} diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 53e8b23b3d62..a59eafadeb93 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -13,12 +13,10 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H -#include "AMDGPURegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" -#include <map> #define GET_INSTRINFO_HEADER #define GET_INSTRINFO_ENUM @@ -39,78 +37,12 @@ class MachineInstrBuilder; class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { private: - const AMDGPURegisterInfo RI; - virtual void anchor(); -protected: const AMDGPUSubtarget &ST; -public: - explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const override; - bool hasLoadFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const override; - unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; - unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, - int &FrameIndex) const; - bool hasStoreFromStackSlot(const MachineInstr *MI, - const MachineMemOperand *&MMO, - int &FrameIndex) const; - - MachineInstr * - convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const override; - - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - -protected: - MachineInstr 
*foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override; + virtual void anchor(); public: - /// \returns the smallest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexBegin(const MachineFunction &MF) const; - - /// \returns the largest register index that will be accessed by an indirect - /// read or write or -1 if indirect addressing is not used by this program. - int getIndirectIndexEnd(const MachineFunction &MF) const; - - bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr *> &NewMIs) const override; - bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode *> &NewNodes) const override; - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; + explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); bool enableClusterLoads() const override; @@ -118,81 +50,14 @@ public: int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; - void insertNoop(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const override; - bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const override; - bool isPredicable(MachineInstr *MI) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - - // Helper functions that check the opcode for status information - bool isRegisterStore(const MachineInstr &MI) const; - bool isRegisterLoad(const MachineInstr &MI) const; - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. int pseudoToMCOpcode(int Opcode) const; - /// \brief Return the descriptor of the target-specific machine instruction - /// that corresponds to the specified pseudo or native opcode. - const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { - return get(pseudoToMCOpcode(Opcode)); - } - - ArrayRef<std::pair<int, const char *>> - getSerializableTargetIndices() const override; - -//===---------------------------------------------------------------------===// -// Pure virtual funtions to be implemented by sub-classes. -//===---------------------------------------------------------------------===// - - virtual bool isMov(unsigned opcode) const = 0; - - /// \brief Calculate the "Indirect Address" for the given \p RegIndex and - /// \p Channel - /// - /// We model indirect addressing using a virtual address space that can be - /// accesed with loads and stores. The "Indirect Address" is the memory - /// address in this virtual address space that maps to the given \p RegIndex - /// and \p Channel. 
- virtual unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const = 0; - - /// \returns The register class to be used for loading and storing values - /// from an "Indirect Address" . - virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0; - - /// \brief Build instruction(s) for an indirect register write. - /// - /// \returns The instruction that performs the indirect register write - virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build instruction(s) for an indirect register read. - /// - /// \returns The instruction that performs the indirect register read - virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const = 0; - - /// \brief Build a MOV instruction. - virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const = 0; - /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the /// equivalent opcode that writes \p Channels Channels. int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const; - }; namespace AMDGPU { diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index 575dfe413658..2b13bb9079ea 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -44,6 +44,11 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4, // AMDGPU DAG Nodes // +def AMDGPUconstdata_ptr : SDNode< + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>, + SDTCisVT<0, iPTR>]> +>; + // This argument to this node is a dword address. def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; @@ -63,7 +68,7 @@ def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>; // out = 1.0 / sqrt(a) result clamped to +/- max_float. 
-def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>; +def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>; def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; @@ -183,6 +188,11 @@ def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", SDTypeProfile<0, 2, []>, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP", + SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + def AMDGPUround : SDNode<"ISD::FROUND", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>; @@ -209,6 +219,16 @@ def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp, [] >; +def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, + [] +>; + +def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; @@ -241,5 +261,8 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai //===----------------------------------------------------------------------===// // Call/Return DAG Nodes //===----------------------------------------------------------------------===// -def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, +def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td index 2a7ce6a47176..6761b4b5df95 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -12,7 +12,8 @@ // //===----------------------------------------------------------------------===// -class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { +class AMDGPUInst <dag outs, dag ins, string asm = "", + list<dag> pattern = []> : Instruction { field bit isRegisterLoad = 0; field bit isRegisterStore = 0; @@ -23,15 +24,22 @@ class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instructio let Pattern = pattern; let Itinerary = NullALU; + // SoftFail is a field the disassembler can use to provide a way for + // instructions to not match without killing the whole decode process. It is + // mainly used for ARM, but Tablegen expects this field to exist or it fails + // to build the decode table. + field bits<64> SoftFail = 0; + + let DecoderNamespace = Namespace; + let TSFlags{63} = isRegisterLoad; let TSFlags{62} = isRegisterStore; } -class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> - : AMDGPUInst<outs, ins, asm, pattern> { +class AMDGPUShaderInst <dag outs, dag ins, string asm = "", + list<dag> pattern = []> : AMDGPUInst<outs, ins, asm, pattern> { field bits<32> Inst = 0xffffffff; - } def FP32Denormals : Predicate<"Subtarget.hasFP32Denormals()">; @@ -41,6 +49,13 @@ def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">; def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>; +// 32-bit VALU immediate operand that uses the constant bus. 
+def u32kimm : Operand<i32> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_KIMM32"; + let PrintMethod = "printU32ImmOperand"; +} + let OperandType = "OPERAND_IMMEDIATE" in { def u32imm : Operand<i32> { @@ -146,6 +161,17 @@ def COND_NULL : PatLeaf < [{(void)N; return false;}] >; + +//===----------------------------------------------------------------------===// +// Misc. PatFrags +//===----------------------------------------------------------------------===// + +class HasOneUseBinOp<SDPatternOperator op> : PatFrag< + (ops node:$src0, node:$src1), + (op $src0, $src1), + [{ return N->hasOneUse(); }] +>; + //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments //===----------------------------------------------------------------------===// @@ -168,21 +194,58 @@ def truncstorei8_private : PrivateStore <truncstorei8>; def truncstorei16_private : PrivateStore <truncstorei16>; def store_private : PrivateStore <store>; -def global_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast<StoreSDNode>(N)); +class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; // Global address space loads -def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +class GlobalLoad <SDPatternOperator op> : GlobalMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +def global_load : GlobalLoad <load>; + +// Global address space stores +class GlobalStore <SDPatternOperator op> : GlobalMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +def global_store : GlobalStore <store>; +def global_store_atomic : GlobalStore<atomic_store>; + + +class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; }]>; // Constant address space loads -def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +class ConstantLoad <SDPatternOperator op> : ConstantMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +def constant_load : ConstantLoad<load>; + +class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +}]>; + +// Local address space loads +class LocalLoad <SDPatternOperator op> : LocalMemOp < + (ops node:$ptr), (op node:$ptr) +>; + +class LocalStore <SDPatternOperator op> : LocalMemOp < + (ops node:$value, node:$ptr), (op node:$value, node:$ptr) +>; + +class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{ + return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS; }]>; +class FlatLoad <SDPatternOperator op> : FlatMemOp < + (ops node:$ptr), (op node:$ptr) +>; + class AZExtLoadBase <SDPatternOperator ld_node>: PatFrag<(ops node:$ptr), (ld_node node:$ptr), [{ LoadSDNode *L = cast<LoadSDNode>(N); @@ -196,29 +259,14 @@ def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8; }]>; -def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi8_global : GlobalLoad <az_extloadi8>; +def sextloadi8_global : GlobalLoad <sextloadi8>; 
-def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; +def az_extloadi8_constant : ConstantLoad <az_extloadi8>; +def sextloadi8_constant : ConstantLoad <sextloadi8>; -def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; - -def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ - return isLocalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi8_local : LocalLoad <az_extloadi8>; +def sextloadi8_local : LocalLoad <sextloadi8>; def extloadi8_private : PrivateLoad <az_extloadi8>; def sextloadi8_private : PrivateLoad <sextloadi8>; @@ -227,29 +275,14 @@ def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16; }]>; -def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; - -def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; - -def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; +def az_extloadi16_global : GlobalLoad <az_extloadi16>; +def sextloadi16_global : GlobalLoad <sextloadi16>; -def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi16_constant : ConstantLoad <az_extloadi16>; +def sextloadi16_constant : ConstantLoad <sextloadi16>; -def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ - return isLocalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi16_local : LocalLoad <az_extloadi16>; +def sextloadi16_local : LocalLoad <sextloadi16>; def extloadi16_private : PrivateLoad <az_extloadi16>; def sextloadi16_private : PrivateLoad <sextloadi16>; @@ -258,49 +291,20 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32; }]>; -def az_extloadi32_global : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isGlobalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi32_global : GlobalLoad <az_extloadi32>; -def az_extloadi32_flat : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def az_extloadi32_flat : FlatLoad <az_extloadi32>; -def az_extloadi32_constant : PatFrag<(ops node:$ptr), - (az_extloadi32 node:$ptr), [{ - return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); -}]>; +def az_extloadi32_constant : ConstantLoad <az_extloadi32>; -def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isGlobalStore(dyn_cast<StoreSDNode>(N)); -}]>; +def truncstorei8_global : GlobalStore <truncstorei8>; +def truncstorei16_global : GlobalStore <truncstorei16>; -def local_store : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - 
return isLocalStore(dyn_cast<StoreSDNode>(N)); -}]>; +def local_store : LocalStore <store>; +def truncstorei8_local : LocalStore <truncstorei8>; +def truncstorei16_local : LocalStore <truncstorei16>; -def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei8 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr), - (truncstorei16 node:$val, node:$ptr), [{ - return isLocalStore(dyn_cast<StoreSDNode>(N)); -}]>; - -def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return isLocalLoad(dyn_cast<LoadSDNode>(N)); -}]>; +def local_load : LocalLoad <load>; class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{ return cast<MemSDNode>(N)->getAlignment() % 8 == 0; @@ -370,6 +374,12 @@ class global_binary_atomic_op<SDNode atomic_op> : PatFrag< [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}] >; +class flat_binary_atomic_op<SDNode atomic_op> : PatFrag< + (ops node:$ptr, node:$value), + (atomic_op node:$ptr, node:$value), + [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}] +>; + def atomic_swap_global : global_binary_atomic_op<atomic_swap>; def atomic_add_global : global_binary_atomic_op<atomic_load_add>; def atomic_and_global : global_binary_atomic_op<atomic_load_and>; @@ -381,6 +391,26 @@ def atomic_umax_global : global_binary_atomic_op<atomic_load_umax>; def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>; def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>; +def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>; +def atomic_cmp_swap_global_nortn : PatFrag< + (ops node:$ptr, node:$value), + (atomic_cmp_swap_global node:$ptr, node:$value), + [{ return SDValue(N, 0).use_empty(); }] +>; + +def atomic_swap_flat : flat_binary_atomic_op<atomic_swap>; +def atomic_add_flat : flat_binary_atomic_op<atomic_load_add>; +def atomic_and_flat : flat_binary_atomic_op<atomic_load_and>; +def atomic_max_flat : flat_binary_atomic_op<atomic_load_max>; +def atomic_min_flat : flat_binary_atomic_op<atomic_load_min>; +def atomic_or_flat : flat_binary_atomic_op<atomic_load_or>; +def atomic_sub_flat : flat_binary_atomic_op<atomic_load_sub>; +def atomic_umax_flat : flat_binary_atomic_op<atomic_load_umax>; +def atomic_umin_flat : flat_binary_atomic_op<atomic_load_umin>; +def atomic_xor_flat : flat_binary_atomic_op<atomic_load_xor>; + +def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>; + //===----------------------------------------------------------------------===// // Misc Pattern Fragments //===----------------------------------------------------------------------===// @@ -392,6 +422,7 @@ int TWO_PI_INV = 0x3e22f983; int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding int FP32_NEG_ONE = 0xbf800000; int FP32_ONE = 0x3f800000; +int FP64_ONE = 0x3ff0000000000000; } def CONST : Constants; @@ -570,6 +601,25 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat < (BIT_ALIGN $src0, $src0, $src1) >; +// This matches 16 permutations of +// max(min(x, y), min(max(x, y), z)) +class IntMed3Pat<Instruction med3Inst, + SDPatternOperator max, + SDPatternOperator max_oneuse, + SDPatternOperator min_oneuse> : Pat< + (max (min_oneuse i32:$src0, i32:$src1), + (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst $src0, $src1, $src2) +>; + +let Properties = [SDNPCommutative, SDNPAssociative] in { +def smax_oneuse : HasOneUseBinOp<smax>; +def smin_oneuse : 
HasOneUseBinOp<smin>; +def umax_oneuse : HasOneUseBinOp<umax>; +def umin_oneuse : HasOneUseBinOp<umin>; +} // Properties = [SDNPCommutative, SDNPAssociative] + + // 24-bit arithmetic patterns def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>; @@ -587,13 +637,6 @@ def cvt_flr_i32_f32 : PatFrag < [{ (void)N; return TM.Options.NoNaNsFPMath; }] >; -/* -class UMUL24Pattern <Instruction UMUL24> : Pat < - (mul U24:$x, U24:$y), - (UMUL24 $x, $y) ->; -*/ - class IMad24Pat<Instruction Inst> : Pat < (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2), (Inst $src0, $src1, $src2) @@ -604,30 +647,6 @@ class UMad24Pat<Instruction Inst> : Pat < (Inst $src0, $src1, $src2) >; -multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> { - def _expand_imad24 : Pat < - (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_imul24 : Pat < - (AMDGPUmul_i24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - -multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> { - def _expand_umad24 : Pat < - (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2), - (AddInst (MulInst $src0, $src1), $src2) - >; - - def _expand_umul24 : Pat < - (AMDGPUmul_u24 i32:$src0, i32:$src1), - (MulInst $src0, $src1) - >; -} - class RcpPat<Instruction RcpInst, ValueType vt> : Pat < (fdiv FP_ONE, vt:$src), (RcpInst $src) diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp index e94bb6013d83..791872a9db40 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp @@ -20,46 +20,44 @@ using namespace llvm; -#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN -#include "AMDGPUGenIntrinsics.inc" -#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN - AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo() : TargetIntrinsicInfo() {} -std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, - unsigned numTys) const { - static const char *const names[] = { +static const char *const IntrinsicNameTable[] = { #define GET_INTRINSIC_NAME_TABLE #include "AMDGPUGenIntrinsics.inc" #undef GET_INTRINSIC_NAME_TABLE - }; +}; +std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned numTys) const { if (IntrID < Intrinsic::num_intrinsics) { return nullptr; } assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics && "Invalid intrinsic ID"); - std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]); return Result; } -unsigned AMDGPUIntrinsicInfo::lookupName(const char *Name, +unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData, unsigned Len) const { - if (!StringRef(Name, Len).startswith("llvm.")) + StringRef Name(NameData, Len); + if (!Name.startswith("llvm.")) return 0; // All intrinsics start with 'llvm.' -#define GET_FUNCTION_RECOGNIZER -#include "AMDGPUGenIntrinsics.inc" -#undef GET_FUNCTION_RECOGNIZER - AMDGPUIntrinsic::ID IntrinsicID = - (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; - IntrinsicID = getIntrinsicForGCCBuiltin("AMDGPU", Name); - - if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { - return IntrinsicID; + // Look for a name match in our table. If the intrinsic is not overloaded, + // require an exact match. If it is overloaded, require a prefix match. The + // AMDGPU enum enum starts at Intrinsic::num_intrinsics. 
+ int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name); + if (Idx >= 0) { + bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]); + return IsPrefixMatch == isOverloaded(Idx + 1) + ? Intrinsic::num_intrinsics + Idx + : 0; } + return 0; } diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h index 4c95b5ec0974..f4173929259c 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h +++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h @@ -11,8 +11,8 @@ /// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class. // //===-----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUINTRINSICINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINTRINSICINFO_H #include "llvm/IR/Intrinsics.h" #include "llvm/Target/TargetIntrinsicInfo.h" @@ -31,7 +31,7 @@ enum ID { } // end namespace AMDGPUIntrinsic -class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo { +class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo { public: AMDGPUIntrinsicInfo(); std::string getName(unsigned IntrId, Type **Tys = nullptr, diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td index 1de3546485b1..2127391f18e7 100644 --- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -12,79 +12,26 @@ //===----------------------------------------------------------------------===// let TargetPrefix = "AMDGPU", isTarget = 1 in { - - def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; - def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_abs : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - // This is named backwards (instead of rsq_legacy) so we don't have - // to define it with the public builtins intrinsics. This is a - // workaround for how intrinsic names are parsed. If the name is - // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant - // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name. 
- def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - - def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; def int_AMDGPU_kilp : Intrinsic<[], [], []>; - def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte0 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte1 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte2 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_cvt_f32_ubyte3 : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of separate 
int_amdgcn_cube* intrinsics. def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Deprecated in favor of expanded bit operations def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; -} - -// Legacy names for compatibility. -let TargetPrefix = "AMDIL", isTarget = 1 in { - def int_AMDIL_abs : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_fraction : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_exp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - def int_AMDIL_round_nearest : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; -} -let TargetPrefix = "TGSI", isTarget = 1 in { + // Deprecated in favor of llvm.amdgcn.rsq + def int_AMDGPU_rsq : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + >; - def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; + // Deprecated in favor of llvm.amdgcn.read.workdim + def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; } include "SIIntrinsics.td" diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index dfc652f31da5..ad8d3e4d3545 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -15,9 +15,9 @@ #include "AMDGPUMCInstLower.h" #include "AMDGPUAsmPrinter.h" +#include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" #include "InstPrinter/AMDGPUInstPrinter.h" -#include "R600InstrInfo.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -37,8 +37,14 @@ using namespace llvm; AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st): - Ctx(ctx), ST(st) -{ } + Ctx(ctx), ST(st) { } + +static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { + switch (MOFlags) { + default: return MCSymbolRefExpr::VK_None; + case SIInstrInfo::MO_GOTPCREL: return MCSymbolRefExpr::VK_GOTPCREL; + } +} void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { @@ -70,11 +76,16 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(GV->getName())); - MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); + const MCExpr *SymExpr = + MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); + const MCExpr *Expr = MCBinaryExpr::createAdd(SymExpr, + MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); + MCOp = MCOperand::createExpr(Expr); break; } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = 
Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); + Sym->setExternal(true); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); MCOp = MCOperand::createExpr(Expr); break; @@ -88,13 +99,13 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { const AMDGPUSubtarget &STI = MF->getSubtarget<AMDGPUSubtarget>(); AMDGPUMCInstLower MCInstLowering(OutContext, STI); -#ifdef _DEBUG StringRef Err; - if (!STI.getInstrInfo()->verifyInstruction(MI, Err)) { - errs() << "Warning: Illegal instruction detected: " << Err << "\n"; + if (!STI.getInstrInfo()->verifyInstruction(*MI, Err)) { + LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext(); + C.emitError("Illegal instruction detected: " + Err); MI->dump(); } -#endif + if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); @@ -103,6 +114,29 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { ++I; } } else { + // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder + // terminator instructions and should only be printed as comments. + if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) { + if (isVerbose()) { + SmallVector<char, 16> BBStr; + raw_svector_ostream Str(BBStr); + + const MachineBasicBlock *MBB = MI->getOperand(0).getMBB(); + const MCSymbolRefExpr *Expr + = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); + Expr->print(Str, MAI); + OutStreamer->emitRawComment(" mask branch " + BBStr); + } + + return; + } + + if (MI->getOpcode() == AMDGPU::SI_RETURN) { + if (isVerbose()) + OutStreamer->emitRawComment(" return"); + return; + } + MCInst TmpInst; MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); @@ -114,10 +148,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { raw_string_ostream DisasmStream(DisasmLine); AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), - *MF->getSubtarget().getInstrInfo(), - *MF->getSubtarget().getRegisterInfo()); - InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), - MF->getSubtarget()); + *STI.getInstrInfo(), + *STI.getRegisterInfo()); + InstPrinter.printInst(&TmpInst, DisasmStream, StringRef(), STI); // Disassemble instruction/operands to hex representation. SmallVector<MCFixup, 4> Fixups; diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/lib/Target/AMDGPU/AMDGPUMCInstLower.h index d322fe072b2b..957dcd0de8ef 100644 --- a/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H -#define LLVM_LIB_TARGET_R600_AMDGPUMCINSTLOWER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H namespace llvm { diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 54137177e4c0..44516dab04f1 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,8 +1,5 @@ #include "AMDGPUMachineFunction.h" -#include "AMDGPU.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "llvm/IR/Attributes.h" -#include "llvm/IR/Function.h" + using namespace llvm; // Pin the vtable to this file. 
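For reference, the MO_GlobalAddress lowering in AMDGPUMCInstLower.cpp above now emits the symbol together with its relocation variant kind and the operand's byte offset. A minimal standalone sketch of that MC-layer pattern (hypothetical helper name, symbol, and offset; the GOTPCREL variant is used purely as the example; this is not code from the patch) might look like:

#include <cstdint>
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

// Illustration only: builds "Sym@GOTPCREL + Offset" the same way the lowering
// above composes MCSymbolRefExpr, MCConstantExpr and MCBinaryExpr. Sym and
// Offset are placeholder inputs, not values taken from this patch.
static const MCExpr *buildGlobalAddrExpr(MCSymbol *Sym, int64_t Offset,
                                         MCContext &Ctx) {
  const MCExpr *SymExpr =
      MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
  return MCBinaryExpr::createAdd(SymExpr, MCConstantExpr::create(Offset, Ctx),
                                 Ctx);
}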
@@ -10,11 +7,17 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), - ShaderType(ShaderType::COMPUTE), + KernArgSize(0), + MaxKernArgAlign(0), LDSSize(0), ABIArgOffset(0), ScratchSize(0), - IsKernel(true) { + IsKernel(MF.getFunction()->getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL || + MF.getFunction()->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) +{ +} - ShaderType = AMDGPU::getShaderType(*MF.getFunction()); +bool AMDGPUMachineFunction::isKernel() const +{ + return IsKernel; } diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 46fcee874887..6b31f63e1a9d 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -1,4 +1,4 @@ -//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -6,12 +6,9 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -/// \file -//===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H -#define LLVM_LIB_TARGET_R600_AMDGPUMACHINEFUNCTION_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H #include "llvm/CodeGen/MachineFunction.h" #include <map> @@ -19,11 +16,25 @@ namespace llvm { class AMDGPUMachineFunction : public MachineFunctionInfo { + uint64_t KernArgSize; + unsigned MaxKernArgAlign; + virtual void anchor(); - unsigned ShaderType; public: AMDGPUMachineFunction(const MachineFunction &MF); + + uint64_t allocateKernArg(uint64_t Size, unsigned Align) { + assert(isPowerOf2_32(Align)); + KernArgSize = alignTo(KernArgSize, Align); + + uint64_t Result = KernArgSize; + KernArgSize += Size; + + MaxKernArgAlign = std::max(Align, MaxKernArgAlign); + return Result; + } + /// A map to keep track of local memory objects and their offsets within /// the local memory space. std::map<const GlobalValue *, unsigned> LocalMemoryObjects; @@ -33,14 +44,7 @@ public: /// Start of implicit kernel args unsigned ABIArgOffset; - unsigned getShaderType() const { - return ShaderType; - } - - bool isKernel() const { - // FIXME: Assume everything is a kernel until function calls are supported. 
- return true; - } + bool isKernel() const; unsigned ScratchSize; bool IsKernel; diff --git a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp index 554bf1da81f5..8bc7b53435be 100644 --- a/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp +++ b/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -25,7 +25,6 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/Passes.h" diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 87d50d587059..775463809634 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -16,7 +16,8 @@ #include "AMDGPUSubtarget.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -26,79 +27,317 @@ using namespace llvm; namespace { -class AMDGPUPromoteAlloca : public FunctionPass, - public InstVisitor<AMDGPUPromoteAlloca> { - - static char ID; +// FIXME: This can create globals so should be a module pass. +class AMDGPUPromoteAlloca : public FunctionPass { +private: + const TargetMachine *TM; Module *Mod; - const AMDGPUSubtarget &ST; - int LocalMemAvailable; + const DataLayout *DL; + MDNode *MaxWorkGroupSizeRange; + + // FIXME: This should be per-kernel. + uint32_t LocalMemLimit; + uint32_t CurrentLocalMemUsage; + + bool IsAMDGCN; + bool IsAMDHSA; + + std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder); + Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); + + /// BaseAlloca is the alloca root the search started from. + /// Val may be that alloca or a recursive user of it. + bool collectUsesWithPtrTypes(Value *BaseAlloca, + Value *Val, + std::vector<Value*> &WorkList) const; + + /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand + /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). + /// Returns true if both operands are derived from the same alloca. Val should + /// be the same value as one of the input operands of UseInst. 
+ bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, + Instruction *UseInst, + int OpIdx0, int OpIdx1) const; public: - AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), - LocalMemAvailable(0) { } + static char ID; + + AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) : + FunctionPass(ID), + TM(TM_), + Mod(nullptr), + DL(nullptr), + MaxWorkGroupSizeRange(nullptr), + LocalMemLimit(0), + CurrentLocalMemUsage(0), + IsAMDGCN(false), + IsAMDHSA(false) { } + bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; - const char *getPassName() const override { return "AMDGPU Promote Alloca"; } - void visitAlloca(AllocaInst &I); + + const char *getPassName() const override { + return "AMDGPU Promote Alloca"; + } + + void handleAlloca(AllocaInst &I); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace char AMDGPUPromoteAlloca::ID = 0; +INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE, + "AMDGPU promote alloca to vector or LDS", false, false) + +char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; + + bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + if (!TM) + return false; + Mod = &M; + DL = &Mod->getDataLayout(); + + // The maximum workitem id. + // + // FIXME: Should get as subtarget property. Usually runtime enforced max is + // 256. + MDBuilder MDB(Mod->getContext()); + MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048)); + + const Triple &TT = TM->getTargetTriple(); + + IsAMDGCN = TT.getArch() == Triple::amdgcn; + IsAMDHSA = TT.getOS() == Triple::AMDHSA; + return false; } bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + if (!TM || skipFunction(F)) + return false; - FunctionType *FTy = F.getFunctionType(); - - LocalMemAvailable = ST.getLocalMemorySize(); + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + if (!ST.isPromoteAllocaEnabled()) + return false; + FunctionType *FTy = F.getFunctionType(); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. - for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - Type *ParamTy = FTy->getParamType(i); - if (ParamTy->isPointerTy() && - ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { - LocalMemAvailable = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast<PointerType>(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. 
Promoting to " "local memory disabled.\n"); - break; + return false; } } - if (LocalMemAvailable > 0) { - // Check how much local memory is being used by global objects - for (Module::global_iterator I = Mod->global_begin(), - E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = &*I; - PointerType *GVTy = GV->getType(); - if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast<Instruction>(U); + if (!Use) continue; - for (Value::use_iterator U = GV->use_begin(), - UE = GV->use_end(); U != UE; ++U) { - Instruction *Use = dyn_cast<Instruction>(*U); - if (!Use) - continue; - if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GVTy->getElementType()); + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; } } } - LocalMemAvailable = std::max(0, LocalMemAvailable); - DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n"); + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage); - visit(F); + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. - return false; + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint + = AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0); + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy); + + // Program is possibly broken by using more local mem than available. 
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + BasicBlock &EntryBB = *F.begin(); + for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { + AllocaInst *AI = dyn_cast<AllocaInst>(I); + + ++I; + if (AI) + handleAlloca(*AI); + } + + return true; +} + +std::pair<Value *, Value *> +AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { + if (!IsAMDHSA) { + Function *LocalSizeYFn + = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn + = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + + CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {}); + CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); + + LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + return std::make_pair(LocalSizeY, LocalSizeZ); + } + + // We must read the size out of the dispatch pointer. + assert(IsAMDGCN); + + // We are indexing into this struct, and want to extract the workgroup_size_* + // fields. + // + // typedef struct hsa_kernel_dispatch_packet_s { + // uint16_t header; + // uint16_t setup; + // uint16_t workgroup_size_x ; + // uint16_t workgroup_size_y; + // uint16_t workgroup_size_z; + // uint16_t reserved0; + // uint32_t grid_size_x ; + // uint32_t grid_size_y ; + // uint32_t grid_size_z; + // + // uint32_t private_segment_size; + // uint32_t group_segment_size; + // uint64_t kernel_object; + // + // #ifdef HSA_LARGE_MODEL + // void *kernarg_address; + // #elif defined HSA_LITTLE_ENDIAN + // void *kernarg_address; + // uint32_t reserved1; + // #else + // uint32_t reserved1; + // void *kernarg_address; + // #endif + // uint64_t reserved2; + // hsa_signal_t completion_signal; // uint64_t wrapper + // } hsa_kernel_dispatch_packet_t + // + Function *DispatchPtrFn + = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + + CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); + DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias); + DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull); + + // Size of the dispatch packet struct. + DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64); + + Type *I32Ty = Type::getInt32Ty(Mod->getContext()); + Value *CastDispatchPtr = Builder.CreateBitCast( + DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS)); + + // We could do a single 64-bit load here, but it's likely that the basic + // 32-bit and extract sequence is already present, and it is probably easier + // to CSE this. The loads should be mergable later anyway. + Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1); + LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4); + + Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2); + LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4); + + MDNode *MD = llvm::MDNode::get(Mod->getContext(), None); + LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD); + LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD); + LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + // Extract y component. 
Upper half of LoadZU should be zero already. + Value *Y = Builder.CreateLShr(LoadXY, 16); + + return std::make_pair(Y, LoadZU); +} + +Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { + Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; + + switch (N) { + case 0: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x + : Intrinsic::r600_read_tidig_x; + break; + case 1: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y + : Intrinsic::r600_read_tidig_y; + break; + + case 2: + IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z + : Intrinsic::r600_read_tidig_z; + break; + default: + llvm_unreachable("invalid dimension"); + } + + Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + CallInst *CI = Builder.CreateCall(WorkitemIdFn); + CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange); + + return CI; } static VectorType *arrayTypeToVecType(Type *ArrayTy) { @@ -151,17 +390,16 @@ static bool canVectorizeInst(Instruction *Inst, User *User) { } static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { - Type *AllocaTy = Alloca->getAllocatedType(); + ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType()); - DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + DEBUG(dbgs() << "Alloca candidate for vectorization\n"); // FIXME: There is no reason why we can't support larger arrays, we // are just being conservative for now. - if (!AllocaTy->isArrayTy() || - AllocaTy->getArrayElementType()->isVectorTy() || - AllocaTy->getArrayNumElements() > 4) { - - DEBUG(dbgs() << " Cannot convert type to vector"); + if (!AllocaTy || + AllocaTy->getElementType()->isVectorTy() || + AllocaTy->getNumElements() > 4) { + DEBUG(dbgs() << " Cannot convert type to vector\n"); return false; } @@ -200,9 +438,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); - for (std::vector<Value*>::iterator I = WorkList.begin(), - E = WorkList.end(); I != E; ++I) { - Instruction *Inst = cast<Instruction>(*I); + for (Value *V : WorkList) { + Instruction *Inst = cast<Instruction>(V); IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { @@ -239,44 +476,163 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { return true; } -static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { - bool Success = true; +static bool isCallPromotable(CallInst *CI) { + // TODO: We might be able to handle some cases where the callee is a + // constantexpr bitcast of a function. + if (!CI->getCalledFunction()) + return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); + if (!II) + return false; + + switch (II->getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::invariant_group_barrier: + case Intrinsic::objectsize: + return true; + default: + return false; + } +} + +bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, + Value *Val, + Instruction *Inst, + int OpIdx0, + int OpIdx1) const { + // Figure out which operand is the one we might not be promoting. 
+ Value *OtherOp = Inst->getOperand(OpIdx0); + if (Val == OtherOp) + OtherOp = Inst->getOperand(OpIdx1); + + if (isa<ConstantPointerNull>(OtherOp)) + return true; + + Value *OtherObj = GetUnderlyingObject(OtherOp, *DL); + if (!isa<AllocaInst>(OtherObj)) + return false; + + // TODO: We should be able to replace undefs with the right pointer type. + + // TODO: If we know the other base object is another promotable + // alloca, not necessarily this alloca, we can do this. The + // important part is both must have the same address space at + // the end. + if (OtherObj != BaseAlloca) { + DEBUG(dbgs() << "Found a binary instruction with another alloca object\n"); + return false; + } + + return true; +} + +bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( + Value *BaseAlloca, + Value *Val, + std::vector<Value*> &WorkList) const { + for (User *User : Val->users()) { - if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) continue; + if (CallInst *CI = dyn_cast<CallInst>(User)) { - // TODO: We might be able to handle some cases where the callee is a - // constantexpr bitcast of a function. - if (!CI->getCalledFunction()) + if (!isCallPromotable(CI)) return false; WorkList.push_back(User); continue; } - // FIXME: Correctly handle ptrtoint instructions. - Instruction *UseInst = dyn_cast<Instruction>(User); - if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt) + Instruction *UseInst = cast<Instruction>(User); + if (UseInst->getOpcode() == Instruction::PtrToInt) return false; - if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) { + if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) { + if (LI->isVolatile()) + return false; + + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) { + if (SI->isVolatile()) + return false; + // Reject if the stored value is not the pointer operand. if (SI->getPointerOperand() != Val) return false; + } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) { + if (RMW->isVolatile()) + return false; + } else if (AtomicCmpXchgInst *CAS + = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) { + if (CAS->isVolatile()) + return false; + } + + // Only promote a select if we know that the other select operand + // is from another pointer that will also be promoted. + if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) + return false; + + // May need to rewrite constant operands. + WorkList.push_back(ICmp); } if (!User->getType()->isPointerTy()) continue; - WorkList.push_back(User); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) { + // Be conservative if an address could be computed outside the bounds of + // the alloca. + if (!GEP->isInBounds()) + return false; + } - Success &= collectUsesWithPtrTypes(User, WorkList); + // Only promote a select if we know that the other select operand is from + // another pointer that will also be promoted. + if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) + return false; + } + + // Repeat for phis. + if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) { + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. 
+ switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) + return false; + break; + default: + return false; + } + } + + WorkList.push_back(User); + if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) + return false; } - return Success; + + return true; } -void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { - if (!I.isStaticAlloca()) +// FIXME: Should try to pick the most likely to be profitable allocas first. +void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { + // Array allocations are probably not worth handling, since an allocation of + // the array type is the canonical form. + if (!I.isStaticAlloca() || I.isArrayAllocation()) return; IRBuilder<> Builder(&I); @@ -286,95 +642,144 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) + if (tryPromoteAllocaToVector(&I)) { + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + return; + } + + const Function &ContainingFunction = *I.getParent()->getParent(); + + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + if (AMDGPU::isShader(ContainingFunction.getCallingConv())) return; - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + // FIXME: We should also try to get this value from the reqd_work_group_size + // function attribute if it is available. + unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction); - // FIXME: This is the maximum work group size. We should try to get - // value from the reqd_work_group_size function attribute if it is - // available. - unsigned WorkGroupSize = 256; - int AllocaSize = - WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy); + const DataLayout &DL = Mod->getDataLayout(); - if (AllocaSize > LocalMemAvailable) { - DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + unsigned Align = I.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(I.getAllocatedType()); + + // FIXME: This computed padding is likely wrong since it depends on inverse + // usage order. + // + // FIXME: It is also possible that if we're allowed to use all of the memory + // could could end up using more than the maximum due to alignment padding. 
+ + uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align); + uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy); + NewSize += AllocSize; + + if (NewSize > LocalMemLimit) { + DEBUG(dbgs() << " " << AllocSize + << " bytes of local memory not available to promote\n"); return; } + CurrentLocalMemUsage = NewSize; + std::vector<Value*> WorkList; - if (!collectUsesWithPtrTypes(&I, WorkList)) { + if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); - LocalMemAvailable -= AllocaSize; - Type *GVTy = ArrayType::get(I.getAllocatedType(), 256); + Function *F = I.getParent()->getParent(); + + Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( - *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0, - GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); - - FunctionType *FTy = FunctionType::get( - Type::getInt32Ty(Mod->getContext()), false); - AttributeSet AttrSet; - AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); - - Value *ReadLocalSizeY = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.y", FTy, AttrSet); - Value *ReadLocalSizeZ = Mod->getOrInsertFunction( - "llvm.r600.read.local.size.z", FTy, AttrSet); - Value *ReadTIDIGX = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.x", FTy, AttrSet); - Value *ReadTIDIGY = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.y", FTy, AttrSet); - Value *ReadTIDIGZ = Mod->getOrInsertFunction( - "llvm.r600.read.tidig.z", FTy, AttrSet); - - Value *TCntY = Builder.CreateCall(ReadLocalSizeY, {}); - Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ, {}); - Value *TIdX = Builder.CreateCall(ReadTIDIGX, {}); - Value *TIdY = Builder.CreateCall(ReadTIDIGY, {}); - Value *TIdZ = Builder.CreateCall(ReadTIDIGZ, {}); - - Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + *Mod, GVTy, false, GlobalValue::InternalLinkage, + UndefValue::get(GVTy), + Twine(F->getName()) + Twine('.') + I.getName(), + nullptr, + GlobalVariable::NotThreadLocal, + AMDGPUAS::LOCAL_ADDRESS); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + GV->setAlignment(I.getAlignment()); + + Value *TCntY, *TCntZ; + + std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder); + Value *TIdX = getWorkitemID(Builder, 0); + Value *TIdY = getWorkitemID(Builder, 1); + Value *TIdZ = getWorkitemID(Builder, 2); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true); Tmp0 = Builder.CreateMul(Tmp0, TIdX); - Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true); Value *TID = Builder.CreateAdd(Tmp0, Tmp1); TID = Builder.CreateAdd(TID, TIdZ); - std::vector<Value*> Indices; - Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); - Indices.push_back(TID); + Value *Indices[] = { + Constant::getNullValue(Type::getInt32Ty(Mod->getContext())), + TID + }; - Value *Offset = Builder.CreateGEP(GVTy, GV, Indices); + Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); I.mutateType(Offset->getType()); I.replaceAllUsesWith(Offset); I.eraseFromParent(); - for (std::vector<Value*>::iterator i = WorkList.begin(), - e = WorkList.end(); i != e; ++i) { - Value *V = *i; + for (Value *V : WorkList) { CallInst *Call = dyn_cast<CallInst>(V); if (!Call) { - Type *EltTy = V->getType()->getPointerElementType(); - PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) { + 
Value *Src0 = CI->getOperand(0); + Type *EltTy = Src0->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + + if (isa<ConstantPointerNull>(CI->getOperand(0))) + CI->setOperand(0, ConstantPointerNull::get(NewTy)); + + if (isa<ConstantPointerNull>(CI->getOperand(1))) + CI->setOperand(1, ConstantPointerNull::get(NewTy)); + + continue; + } // The operand's value should be corrected on its own. if (isa<AddrSpaceCastInst>(V)) continue; + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + // FIXME: It doesn't really make sense to try to do this for all // instructions. V->mutateType(NewTy); + + // Adjust the types of any constant operands. + if (SelectInst *SI = dyn_cast<SelectInst>(V)) { + if (isa<ConstantPointerNull>(SI->getOperand(1))) + SI->setOperand(1, ConstantPointerNull::get(NewTy)); + + if (isa<ConstantPointerNull>(SI->getOperand(2))) + SI->setOperand(2, ConstantPointerNull::get(NewTy)); + } else if (PHINode *Phi = dyn_cast<PHINode>(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) { + if (isa<ConstantPointerNull>(Phi->getIncomingValue(I))) + Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy)); + } + } + continue; } IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); if (!Intr) { + // FIXME: What is this for? It doesn't make sense to promote arbitrary + // function calls. If the call is to a defined function that can also be + // promoted, we should be able to do this once that function is also + // rewritten. + std::vector<Type*> ArgTypes; for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); ArgIdx != ArgEnd; ++ArgIdx) { @@ -405,6 +810,14 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Intr->eraseFromParent(); continue; } + case Intrinsic::memmove: { + MemMoveInst *MemMove = cast<MemMoveInst>(Intr); + Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(), + MemMove->getLength(), MemMove->getAlignment(), + MemMove->isVolatile()); + Intr->eraseFromParent(); + continue; + } case Intrinsic::memset: { MemSetInst *MemSet = cast<MemSetInst>(Intr); Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), @@ -413,6 +826,28 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { Intr->eraseFromParent(); continue; } + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::invariant_group_barrier: + Intr->eraseFromParent(); + // FIXME: I think the invariant marker should still theoretically apply, + // but the intrinsics need to be changed to accept pointers with any + // address space. 
+ continue; + case Intrinsic::objectsize: { + Value *Src = Intr->getOperand(0); + Type *SrcTy = Src->getType()->getPointerElementType(); + Function *ObjectSize = Intrinsic::getDeclaration(Mod, + Intrinsic::objectsize, + { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) } + ); + + CallInst *NewCall + = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) }); + Intr->replaceAllUsesWith(NewCall); + Intr->eraseFromParent(); + continue; + } default: Intr->dump(); llvm_unreachable("Don't know how to promote alloca intrinsic use."); @@ -420,6 +855,6 @@ void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { } } -FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { - return new AMDGPUPromoteAlloca(ST); +FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) { + return new AMDGPUPromoteAlloca(TM); } diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp index 3ca0eca3417f..941f2d8a468a 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp @@ -24,20 +24,14 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {} // they are not supported at this time. //===----------------------------------------------------------------------===// -const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; +// Dummy to not crash RegisterClassInfo. +static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; -const MCPhysReg* -AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { +const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs( + const MachineFunction *) const { return &CalleeSavedReg; } -void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, - int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("Subroutines not supported yet"); -} - unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { return AMDGPU::NoRegister; } @@ -54,10 +48,5 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const { return SubRegs[Channel]; } -unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const { - - return getSubRegFromChannel(IndirectIndex); -} - #define GET_REGINFO_TARGET_DESC #include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h index 0344834328f6..ef51aad95dce 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -13,10 +13,9 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H -#include "llvm/ADT/BitVector.h" #include "llvm/Target/TargetRegisterInfo.h" #define GET_REGINFO_HEADER @@ -29,30 +28,14 @@ class AMDGPUSubtarget; class TargetInstrInfo; struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { - static const MCPhysReg CalleeSavedReg; - AMDGPURegisterInfo(); - BitVector getReservedRegs(const MachineFunction &MF) const override { - assert(!"Unimplemented"); return BitVector(); - } - - virtual unsigned getHWRegIndex(unsigned Reg) const { - assert(!"Unimplemented"); return 0; - } - /// \returns the sub reg enum value for the given \p Channel /// (e.g. 
getSubRegFromChannel(0) -> AMDGPU::sub0) unsigned getSubRegFromChannel(unsigned Channel) const; const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS) const override; unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getIndirectSubReg(unsigned IndirectIndex) const; - }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h new file mode 100644 index 000000000000..40f639434507 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h @@ -0,0 +1,138 @@ +//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// Enums and structure types used by runtime metadata. +/// +/// Runtime requests certain information (metadata) about kernels to be able +/// to execute the kernels and answer the queries about the kernels. +/// The metadata is represented as a byte stream in an ELF section of a +/// binary (code object). The byte stream consists of key-value pairs. +/// Each key is an 8 bit unsigned integer. Each value can be an integer, +/// a string, or a stream of key-value pairs. There are 3 levels of key-value +/// pair streams. At the beginning of the ELF section is the top level +/// key-value pair stream. A kernel-level key-value pair stream starts after +/// encountering KeyKernelBegin and ends immediately before encountering +/// KeyKernelEnd. A kernel-argument-level key-value pair stream starts +/// after encountering KeyArgBegin and ends immediately before encountering +/// KeyArgEnd. A kernel-level key-value pair stream can only appear in a top +/// level key-value pair stream. A kernel-argument-level key-value pair stream +/// can only appear in a kernel-level key-value pair stream. +/// +/// The format should be kept backward compatible. New enum values and bit +/// fields should be appended at the end. It is suggested to bump up the +/// revision number whenever the format changes and document the change +/// in the revision in this header. +/// +// +//===----------------------------------------------------------------------===// +// +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H + +#include <stdint.h> + +namespace AMDGPU { + +namespace RuntimeMD { + + // Version and revision of runtime metadata + const unsigned char MDVersion = 1; + const unsigned char MDRevision = 0; + + // ELF section name containing runtime metadata + const char SectionName[] = ".AMDGPU.runtime_metadata"; + + // Enumeration values of keys in runtime metadata. + enum Key { + KeyNull = 0, // Place holder. 
Ignored when encountered + KeyMDVersion = 1, // Runtime metadata version + KeyLanguage = 2, // Language + KeyLanguageVersion = 3, // Language version + KeyKernelBegin = 4, // Beginning of kernel-level stream + KeyKernelEnd = 5, // End of kernel-level stream + KeyKernelName = 6, // Kernel name + KeyArgBegin = 7, // Beginning of kernel-arg-level stream + KeyArgEnd = 8, // End of kernel-arg-level stream + KeyArgSize = 9, // Kernel arg size + KeyArgAlign = 10, // Kernel arg alignment + KeyArgTypeName = 11, // Kernel type name + KeyArgName = 12, // Kernel name + KeyArgTypeKind = 13, // Kernel argument type kind + KeyArgValueType = 14, // Kernel argument value type + KeyArgAddrQual = 15, // Kernel argument address qualifier + KeyArgAccQual = 16, // Kernel argument access qualifier + KeyArgIsConst = 17, // Kernel argument is const qualified + KeyArgIsRestrict = 18, // Kernel argument is restrict qualified + KeyArgIsVolatile = 19, // Kernel argument is volatile qualified + KeyArgIsPipe = 20, // Kernel argument is pipe qualified + KeyReqdWorkGroupSize = 21, // Required work group size + KeyWorkGroupSizeHint = 22, // Work group size hint + KeyVecTypeHint = 23, // Vector type hint + KeyKernelIndex = 24, // Kernel index for device enqueue + KeySGPRs = 25, // Number of SGPRs + KeyVGPRs = 26, // Number of VGPRs + KeyMinWavesPerSIMD = 27, // Minimum number of waves per SIMD + KeyMaxWavesPerSIMD = 28, // Maximum number of waves per SIMD + KeyFlatWorkGroupSizeLimits = 29, // Flat work group size limits + KeyMaxWorkGroupSize = 30, // Maximum work group size + KeyNoPartialWorkGroups = 31, // No partial work groups + }; + + enum Language : uint8_t { + OpenCL_C = 0, + HCC = 1, + OpenMP = 2, + OpenCL_CPP = 3, +}; + + enum LanguageVersion : uint16_t { + V100 = 100, + V110 = 110, + V120 = 120, + V200 = 200, + V210 = 210, + }; + + namespace KernelArg { + enum TypeKind : uint8_t { + Value = 0, + Pointer = 1, + Image = 2, + Sampler = 3, + Queue = 4, + }; + + enum ValueType : uint16_t { + Struct = 0, + I8 = 1, + U8 = 2, + I16 = 3, + U16 = 4, + F16 = 5, + I32 = 6, + U32 = 7, + F32 = 8, + I64 = 9, + U64 = 10, + F64 = 11, + }; + + enum AccessQualifer : uint8_t { + None = 0, + ReadOnly = 1, + WriteOnly = 2, + ReadWrite = 3, + }; + } // namespace KernelArg +} // namespace RuntimeMD +} // namespace AMDGPU + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 7d70fa73da29..10fa9cf46737 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -15,7 +15,6 @@ #include "AMDGPUSubtarget.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" -#include "R600MachineScheduler.h" #include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" @@ -32,6 +31,8 @@ using namespace llvm; #define GET_SUBTARGETINFO_CTOR #include "AMDGPUGenSubtargetInfo.inc" +AMDGPUSubtarget::~AMDGPUSubtarget() {} + AMDGPUSubtarget & AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { @@ -44,14 +45,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // for SI has the unhelpful behavior that it unsets everything else if you // disable it. - SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,"); if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. 
- FullFS += "+flat-for-global,"; + FullFS += "+flat-for-global,+unaligned-buffer-access,"; FullFS += FS; - if (GPU == "" && TT.getArch() == Triple::amdgcn) - GPU = "SI"; - ParseSubtargetFeatures(GPU, FullFS); // FIXME: I don't think think Evergreen has any useful support for @@ -61,52 +59,142 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, FP32Denormals = false; FP64Denormals = false; } + + // Set defaults if needed. + if (MaxPrivateElementSize == 0) + MaxPrivateElementSize = 4; + return *this; } AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - TargetMachine &TM) - : AMDGPUGenSubtargetInfo(TT, GPU, FS), DevName(GPU), Is64bit(false), - DumpCode(false), R600ALUInst(false), HasVertexCache(false), - TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), - FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), - EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), - EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), - EnableXNACK(false), - WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), - EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), - GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), - IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - EnableSIScheduler(false), FrameLowering(nullptr), - InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { - + const TargetMachine &TM) + : AMDGPUGenSubtargetInfo(TT, GPU, FS), + TargetTriple(TT), + Gen(TT.getArch() == Triple::amdgcn ? SOUTHERN_ISLANDS : R600), + IsaVersion(ISAVersion0_0_0), + WavefrontSize(64), + LocalMemorySize(0), + LDSBankCount(0), + MaxPrivateElementSize(0), + + FastFMAF32(false), + HalfRate64Ops(false), + + FP32Denormals(false), + FP64Denormals(false), + FPExceptions(false), + FlatForGlobal(false), + UnalignedBufferAccess(false), + + EnableXNACK(false), + DebuggerInsertNops(false), + DebuggerReserveRegs(false), + DebuggerEmitPrologue(false), + + EnableVGPRSpilling(false), + EnablePromoteAlloca(false), + EnableLoadStoreOpt(false), + EnableUnsafeDSOffsetFolding(false), + EnableSIScheduler(false), + DumpCode(false), + + FP64(false), + IsGCN(false), + GCN1Encoding(false), + GCN3Encoding(false), + CIInsts(false), + SGPRInitBug(false), + HasSMemRealTime(false), + Has16BitInsts(false), + FlatAddressSpace(false), + + R600ALUInst(false), + CaymanISA(false), + CFALUBug(false), + HasVertexCache(false), + TexVTXClauseSize(0), + + FeatureDisable(false), + InstrItins(getInstrItineraryForCPU(GPU)) { initializeSubtargetDependencies(TT, GPU, FS); +} - const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) - - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - InstrInfo.reset(new R600InstrInfo(*this)); - TLInfo.reset(new R600TargetLowering(TM, *this)); - - // FIXME: Should have R600 specific FrameLowering - FrameLowering.reset(new AMDGPUFrameLowering( - TargetFrameLowering::StackGrowsUp, - MaxStackAlign, - 0)); - } else { - InstrInfo.reset(new SIInstrInfo(*this)); - TLInfo.reset(new SITargetLowering(TM, *this)); - FrameLowering.reset(new SIFrameLowering( - TargetFrameLowering::StackGrowsUp, - MaxStackAlign, - 0)); +// FIXME: These limits are for SI. Did they change with the larger maximum LDS +// size? 
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const { + switch (NWaves) { + case 10: + return 1638; + case 9: + return 1820; + case 8: + return 2048; + case 7: + return 2340; + case 6: + return 2730; + case 5: + return 3276; + case 4: + return 4096; + case 3: + return 5461; + case 2: + return 8192; + default: + return getLocalMemorySize(); } } -unsigned AMDGPUSubtarget::getStackEntrySize() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - switch(getWavefrontSize()) { +unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const { + if (Bytes <= 1638) + return 10; + + if (Bytes <= 1820) + return 9; + + if (Bytes <= 2048) + return 8; + + if (Bytes <= 2340) + return 7; + + if (Bytes <= 2730) + return 6; + + if (Bytes <= 3276) + return 5; + + if (Bytes <= 4096) + return 4; + + if (Bytes <= 5461) + return 3; + + if (Bytes <= 8192) + return 2; + + return 1; +} + +R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM) : + AMDGPUSubtarget(TT, GPU, FS, TM), + InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + TLInfo(TM, *this) {} + +SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM) : + AMDGPUSubtarget(TT, GPU, FS, TM), + InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + TLInfo(TM, *this), + GISel() {} + +unsigned R600Subtarget::getStackEntrySize() const { + switch (getWavefrontSize()) { case 16: return 8; case 32: @@ -118,37 +206,36 @@ unsigned AMDGPUSubtarget::getStackEntrySize() const { } } -unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const { - switch(getGeneration()) { - default: llvm_unreachable("ChipID unknown"); - case SEA_ISLANDS: return 12; - } -} - -AMDGPU::IsaVersion AMDGPUSubtarget::getIsaVersion() const { - return AMDGPU::getIsaVersion(getFeatureBits()); +void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const { + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + + // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. + if (!enableSIScheduler()) + Policy.ShouldTrackLaneMasks = true; } -bool AMDGPUSubtarget::isVGPRSpillingEnabled( - const SIMachineFunctionInfo *MFI) const { - return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const { + return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const { - if (getGeneration() >= SOUTHERN_ISLANDS) { - - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. 
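As an aside on getMaxLocalMemSizeWithWaveCount and getOccupancyWithLocalMemSize above: for wave counts 2 through 10 the hard-coded tiers appear to be 16384 bytes of LDS divided by the wave count, rounded down (16384 / 7 = 2340, 16384 / 3 = 5461, and so on). A small standalone sketch reproducing those tiers under that assumption (illustrative only, not code from this patch) could be:

#include <cstdint>

// Assumption: each occupancy tier above corresponds to 16384 bytes of LDS
// divided by the wave count, rounded down. This holds for NWaves in [2, 10];
// the function in the patch falls back to the full local memory size for
// other wave counts.
static uint32_t approxMaxLDSForWaveCount(unsigned NWaves) {
  return NWaves ? 16384u / NWaves : 0; // 10 -> 1638, 7 -> 2340, 2 -> 8192
}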
- Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; +unsigned SISubtarget::getAmdKernelCodeChipID() const { + switch (getGeneration()) { + case SEA_ISLANDS: + return 12; + default: + llvm_unreachable("ChipID unknown"); } } +AMDGPU::IsaVersion SISubtarget::getIsaVersion() const { + return AMDGPU::getIsaVersion(getFeatureBits()); +} diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 49c94f1eceb8..3fe61aa449e0 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -16,12 +16,14 @@ #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H #include "AMDGPU.h" -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" -#include "AMDGPUISelLowering.h" -#include "AMDGPUSubtarget.h" +#include "R600InstrInfo.h" +#include "R600ISelLowering.h" +#include "R600FrameLowering.h" +#include "SIInstrInfo.h" +#include "SIISelLowering.h" +#include "SIFrameLowering.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/Target/TargetSubtargetInfo.h" #define GET_SUBTARGETINFO_HEADER @@ -30,9 +32,9 @@ namespace llvm { class SIMachineFunctionInfo; +class StringRef; class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { - public: enum Generation { R600 = 0, @@ -45,10 +47,6 @@ public: }; enum { - FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 - }; - - enum { ISAVersion0_0_0, ISAVersion7_0_0, ISAVersion7_0_1, @@ -57,114 +55,116 @@ public: ISAVersion8_0_3 }; -private: - std::string DevName; - bool Is64bit; - bool DumpCode; - bool R600ALUInst; - bool HasVertexCache; - short TexVTXClauseSize; +protected: + // Basic subtarget description. + Triple TargetTriple; Generation Gen; - bool FP64; - bool FP64Denormals; - bool FP32Denormals; + unsigned IsaVersion; + unsigned WavefrontSize; + int LocalMemorySize; + int LDSBankCount; + unsigned MaxPrivateElementSize; + + // Possibly statically set by tablegen, but may want to be overridden. bool FastFMAF32; - bool CaymanISA; - bool FlatAddressSpace; + bool HalfRate64Ops; + + // Dynamially set bits that enable features. + bool FP32Denormals; + bool FP64Denormals; + bool FPExceptions; bool FlatForGlobal; - bool EnableIRStructurizer; + bool UnalignedBufferAccess; + bool EnableXNACK; + bool DebuggerInsertNops; + bool DebuggerReserveRegs; + bool DebuggerEmitPrologue; + + // Used as options. + bool EnableVGPRSpilling; bool EnablePromoteAlloca; - bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; - bool EnableXNACK; - unsigned WavefrontSize; - bool CFALUBug; - int LocalMemorySize; - bool EnableVGPRSpilling; - bool SGPRInitBug; + bool EnableSIScheduler; + bool DumpCode; + + // Subtarget statically properties set by tablegen + bool FP64; bool IsGCN; bool GCN1Encoding; bool GCN3Encoding; bool CIInsts; + bool SGPRInitBug; + bool HasSMemRealTime; + bool Has16BitInsts; + bool FlatAddressSpace; + bool R600ALUInst; + bool CaymanISA; + bool CFALUBug; + bool HasVertexCache; + short TexVTXClauseSize; + + // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable; - int LDSBankCount; - unsigned IsaVersion; - bool EnableHugeScratchBuffer; - bool EnableSIScheduler; - std::unique_ptr<AMDGPUFrameLowering> FrameLowering; - std::unique_ptr<AMDGPUTargetLowering> TLInfo; - std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; - Triple TargetTriple; public: - AMDGPUSubtarget(const Triple &TT, StringRef CPU, StringRef FS, - TargetMachine &TM); + AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM); + virtual ~AMDGPUSubtarget(); AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); - const AMDGPUFrameLowering *getFrameLowering() const override { - return FrameLowering.get(); - } - const AMDGPUInstrInfo *getInstrInfo() const override { - return InstrInfo.get(); - } - const AMDGPURegisterInfo *getRegisterInfo() const override { - return &InstrInfo->getRegisterInfo(); - } - AMDGPUTargetLowering *getTargetLowering() const override { - return TLInfo.get(); - } + const AMDGPUInstrInfo *getInstrInfo() const override; + const AMDGPUFrameLowering *getFrameLowering() const override; + const AMDGPUTargetLowering *getTargetLowering() const override; + const AMDGPURegisterInfo *getRegisterInfo() const override; + const InstrItineraryData *getInstrItineraryData() const override { return &InstrItins; } void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - bool is64bit() const { - return Is64bit; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; + bool isAmdHsaOS() const { + return TargetTriple.getOS() == Triple::AMDHSA; } Generation getGeneration() const { return Gen; } - bool hasHWFP64() const { - return FP64; + unsigned getWavefrontSize() const { + return WavefrontSize; } - bool hasCaymanISA() const { - return CaymanISA; + int getLocalMemorySize() const { + return LocalMemorySize; } - bool hasFP32Denormals() const { - return FP32Denormals; + int getLDSBankCount() const { + return LDSBankCount; } - bool hasFP64Denormals() const { - return FP64Denormals; + unsigned getMaxPrivateElementSize() const { + return MaxPrivateElementSize; + } + + bool hasHWFP64() const { + return FP64; } bool hasFastFMAF32() const { return FastFMAF32; } - bool hasFlatAddressSpace() const { - return FlatAddressSpace; + bool hasHalfRate64Ops() const { + return HalfRate64Ops; } - bool useFlatForGlobal() const { - return FlatForGlobal; + bool hasAddr64() const { + return (getGeneration() < VOLCANIC_ISLANDS); } bool hasBFE() const { @@ -214,116 +214,249 @@ public: return (getGeneration() >= EVERGREEN); } - bool IsIRStructurizerEnabled() const { - return EnableIRStructurizer; + bool hasCaymanISA() const { + return CaymanISA; } bool isPromoteAllocaEnabled() const { return EnablePromoteAlloca; } - bool isIfCvtEnabled() const { - return EnableIfCvt; + bool unsafeDSOffsetFoldingEnabled() const { + return EnableUnsafeDSOffsetFolding; } - bool loadStoreOptEnabled() const { - return EnableLoadStoreOpt; + bool dumpCode() const { + return DumpCode; } - bool unsafeDSOffsetFoldingEnabled() const { - return EnableUnsafeDSOffsetFolding; + /// Return the amount of LDS that can be used that will not restrict the + /// occupancy lower than WaveCount. + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const; + + /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if + /// the given LDS memory size is the only constraint. 
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const; + + + bool hasFP32Denormals() const { + return FP32Denormals; } - unsigned getWavefrontSize() const { - return WavefrontSize; + bool hasFP64Denormals() const { + return FP64Denormals; } - unsigned getStackEntrySize() const; + bool hasFPExceptions() const { + return FPExceptions; + } - bool hasCFAluBug() const { - assert(getGeneration() <= NORTHERN_ISLANDS); - return CFALUBug; + bool useFlatForGlobal() const { + return FlatForGlobal; } - int getLocalMemorySize() const { - return LocalMemorySize; + bool hasUnalignedBufferAccess() const { + return UnalignedBufferAccess; } - bool hasSGPRInitBug() const { - return SGPRInitBug; + bool isXNACKEnabled() const { + return EnableXNACK; } - int getLDSBankCount() const { - return LDSBankCount; + unsigned getMaxWavesPerCU() const { + if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) + return 10; + + // FIXME: Not sure what this is for other subtagets. + return 8; } - unsigned getAmdKernelCodeChipID() const; + /// \brief Returns the offset in bytes from the start of the input buffer + /// of the first explicit kernel argument. + unsigned getExplicitKernelArgOffset() const { + return isAmdHsaOS() ? 0 : 36; + } - AMDGPU::IsaVersion getIsaVersion() const; + unsigned getStackAlignment() const { + // Scratch is allocated in 256 dword per wave blocks. + return 4 * 256 / getWavefrontSize(); + } bool enableMachineScheduler() const override { return true; } - void overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const override; + bool enableSubRegLiveness() const override { + return true; + } +}; - // Helper functions to simplify if statements - bool isTargetELF() const { - return false; +class R600Subtarget final : public AMDGPUSubtarget { +private: + R600InstrInfo InstrInfo; + R600FrameLowering FrameLowering; + R600TargetLowering TLInfo; + +public: + R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); + + const R600InstrInfo *getInstrInfo() const override { + return &InstrInfo; } - StringRef getDeviceName() const { - return DevName; + const R600FrameLowering *getFrameLowering() const override { + return &FrameLowering; } - bool enableHugeScratchBuffer() const { - return EnableHugeScratchBuffer; + const R600TargetLowering *getTargetLowering() const override { + return &TLInfo; } - bool enableSIScheduler() const { - return EnableSIScheduler; + const R600RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); } - bool dumpCode() const { - return DumpCode; + bool hasCFAluBug() const { + return CFALUBug; } - bool r600ALUEncoding() const { - return R600ALUInst; + + bool hasVertexCache() const { + return HasVertexCache; } - bool isAmdHsaOS() const { - return TargetTriple.getOS() == Triple::AMDHSA; + + short getTexVTXClauseSize() const { + return TexVTXClauseSize; } - bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; - bool isXNACKEnabled() const { - return EnableXNACK; + unsigned getStackEntrySize() const; +}; + +class SISubtarget final : public AMDGPUSubtarget { +public: + enum { + FIXED_SGPR_COUNT_FOR_INIT_BUG = 80 + }; + +private: + SIInstrInfo InstrInfo; + SIFrameLowering FrameLowering; + SITargetLowering TLInfo; + std::unique_ptr<GISelAccessor> GISel; + +public: + SISubtarget(const Triple &TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); + + const SIInstrInfo *getInstrInfo() const override { + return &InstrInfo; } - unsigned 
getMaxWavesPerCU() const { - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 10; + const SIFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } - // FIXME: Not sure what this is for other subtagets. - llvm_unreachable("do not know max waves per CU for this subtarget."); + const SITargetLowering *getTargetLowering() const override { + return &TLInfo; } - bool enableSubRegLiveness() const override { - return true; + const CallLowering *getCallLowering() const override { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getCallLowering(); } - /// \brief Returns the offset in bytes from the start of the input buffer - /// of the first explicit kernel argument. - unsigned getExplicitKernelArgOffset() const { - return isAmdHsaOS() ? 0 : 36; + const SIRegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); } + void overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const override; + + bool isVGPRSpillingEnabled(const Function& F) const; + + unsigned getAmdKernelCodeChipID() const; + + AMDGPU::IsaVersion getIsaVersion() const; + unsigned getMaxNumUserSGPRs() const { return 16; } + + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } + + bool hasSMemRealTime() const { + return HasSMemRealTime; + } + + bool has16BitInsts() const { + return Has16BitInsts; + } + + bool enableSIScheduler() const { + return EnableSIScheduler; + } + + bool debuggerSupported() const { + return debuggerInsertNops() && debuggerReserveRegs() && + debuggerEmitPrologue(); + } + + bool debuggerInsertNops() const { + return DebuggerInsertNops; + } + + bool debuggerReserveRegs() const { + return DebuggerReserveRegs; + } + + bool debuggerEmitPrologue() const { + return DebuggerEmitPrologue; + } + + bool loadStoreOptEnabled() const { + return EnableLoadStoreOpt; + } + + bool hasSGPRInitBug() const { + return SGPRInitBug; + } }; + +inline const AMDGPUInstrInfo *AMDGPUSubtarget::getInstrInfo() const { + if (getGeneration() >= SOUTHERN_ISLANDS) + return static_cast<const SISubtarget *>(this)->getInstrInfo(); + + return static_cast<const R600Subtarget *>(this)->getInstrInfo(); +} + +inline const AMDGPUFrameLowering *AMDGPUSubtarget::getFrameLowering() const { + if (getGeneration() >= SOUTHERN_ISLANDS) + return static_cast<const SISubtarget *>(this)->getFrameLowering(); + + return static_cast<const R600Subtarget *>(this)->getFrameLowering(); +} + +inline const AMDGPUTargetLowering *AMDGPUSubtarget::getTargetLowering() const { + if (getGeneration() >= SOUTHERN_ISLANDS) + return static_cast<const SISubtarget *>(this)->getTargetLowering(); + + return static_cast<const R600Subtarget *>(this)->getTargetLowering(); +} + +inline const AMDGPURegisterInfo *AMDGPUSubtarget::getRegisterInfo() const { + if (getGeneration() >= SOUTHERN_ISLANDS) + return static_cast<const SISubtarget *>(this)->getRegisterInfo(); + + return static_cast<const R600Subtarget *>(this)->getRegisterInfo(); +} + } // End namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 519ae5cc748d..3e53f52c689f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,19 +14,23 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" -#include "AMDGPUTargetObjectFile.h" #include 
"AMDGPU.h" +#include "AMDGPUCallLowering.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" + #include "llvm/Analysis/Passes.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/LegacyPassManager.h" @@ -34,10 +38,35 @@ #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include <llvm/CodeGen/Passes.h> +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Vectorize.h" using namespace llvm; +static cl::opt<bool> EnableR600StructurizeCFG( + "r600-ir-structurize", + cl::desc("Use StructurizeCFG IR pass"), + cl::init(true)); + +static cl::opt<bool> EnableSROA( + "amdgpu-sroa", + cl::desc("Run SROA after promote alloca pass"), + cl::ReallyHidden, + cl::init(true)); + +static cl::opt<bool> EnableR600IfConvert( + "r600-if-convert", + cl::desc("Use if conversion pass"), + cl::ReallyHidden, + cl::init(true)); + +// Option to disable vectorizer for tests. +static cl::opt<bool> EnableLoadStoreVectorizer( + "amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); @@ -47,17 +76,22 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFoldOperandsPass(*PR); - initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIShrinkInstructionsPass(*PR); initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); initializeAMDGPUAnnotateUniformValuesPass(*PR); + initializeAMDGPUPromoteAllocaPass(*PR); + initializeAMDGPUCodeGenPreparePass(*PR); + initializeSIAnnotateControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); + initializeSIInsertWaitsPass(*PR); + initializeSIWholeQuadModePass(*PR); + initializeSILowerControlFlowPass(*PR); + initializeSIDebuggerInsertNopsPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { - if (TT.getOS() == Triple::AMDHSA) - return make_unique<AMDGPUHSATargetObjectFile>(); - return make_unique<AMDGPUTargetObjectFile>(); } @@ -73,60 +107,156 @@ static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", createSIMachineScheduler); -static std::string computeDataLayout(const Triple &TT) { - std::string Ret = "e-p:32:32"; - - if (TT.getArch() == Triple::amdgcn) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64"; +static StringRef computeDataLayout(const Triple &TT) { + if (TT.getArch() == Triple::r600) { + // 32-bit pointers. 
+ return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } - Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" - "-v512:512-v1024:1024-v2048:2048-n32:64"; + // 32-bit private, local, and region pointers. 64-bit global, constant and + // flat. + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" + "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; +} + +LLVM_READNONE +static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) { + if (!GPU.empty()) + return GPU; - return Ret; + // HSA only supports CI+, so change the default GPU to a CI for HSA. + if (TT.getArch() == Triple::amdgcn) + return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti"; + + return "r600"; +} + +static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) { + // The AMDGPU toolchain only supports generating shared objects, so we + // must always use PIC. + return Reloc::PIC_; } AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - TargetOptions Options, Reloc::Model RM, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OptLevel) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, - OptLevel), - TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), - IntrinsicInfo() { + : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), + FS, Options, getEffectiveRelocModel(RM), CM, OptLevel), + TLOF(createTLOF(getTargetTriple())), + IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } AMDGPUTargetMachine::~AMDGPUTargetMachine() { } +StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const { + Attribute GPUAttr = F.getFnAttribute("target-cpu"); + return GPUAttr.hasAttribute(Attribute::None) ? + getTargetCPU() : GPUAttr.getValueAsString(); +} + +StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const { + Attribute FSAttr = F.getFnAttribute("target-features"); + + return FSAttr.hasAttribute(Attribute::None) ? + getTargetFeatureString() : + FSAttr.getValueAsString(); +} + //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, + StringRef CPU, StringRef FS, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + +const R600Subtarget *R600TargetMachine::getSubtargetImpl( + const Function &F) const { + StringRef GPU = getGPUName(F); + StringRef FS = getFeatureString(F); + + SmallString<128> SubtargetKey(GPU); + SubtargetKey.append(FS); + + auto &I = SubtargetMap[SubtargetKey]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
+ resetTargetOptions(F); + I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this); + } + + return I.get(); +} //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct SIGISelActualAccessor : public GISelAccessor { + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + const AMDGPUCallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } +}; +} // End anonymous namespace. +#endif + GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, - StringRef FS, StringRef CPU, - TargetOptions Options, Reloc::Model RM, + StringRef CPU, StringRef FS, + TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : AMDGPUTargetMachine(T, TT, FS, CPU, Options, RM, CM, OL) {} + : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {} + +const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { + StringRef GPU = getGPUName(F); + StringRef FS = getFeatureString(F); + + SmallString<128> SubtargetKey(GPU); + SubtargetKey.append(FS); + + auto &I = SubtargetMap[SubtargetKey]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. + resetTargetOptions(F); + I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); + +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); + GISel->CallLoweringInfo.reset( + new AMDGPUCallLowering(*I->getTargetLowering())); +#endif + + I->setGISelAccessor(*GISel); + } + + return I.get(); +} //===----------------------------------------------------------------------===// // AMDGPU Pass Setup //===----------------------------------------------------------------------===// namespace { + class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) @@ -142,16 +272,8 @@ public: return getTM<AMDGPUTargetMachine>(); } - ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - return createR600MachineScheduler(C); - else if (ST.enableSIScheduler()) - return createSIMachineScheduler(C); - return nullptr; - } - + void addEarlyCSEOrGVNPass(); + void addStraightLineScalarOptimizationPasses(); void addIRPasses() override; void addCodeGenPrepare() override; bool addPreISel() override; @@ -159,27 +281,44 @@ public: bool addGCPasses() override; }; -class R600PassConfig : public AMDGPUPassConfig { +class R600PassConfig final : public AMDGPUPassConfig { public: R600PassConfig(TargetMachine *TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { } + ScheduleDAGInstrs *createMachineScheduler( + MachineSchedContext *C) const override { + return createR600MachineScheduler(C); + } + bool addPreISel() override; void addPreRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; -class GCNPassConfig : public AMDGPUPassConfig { +class GCNPassConfig final : public AMDGPUPassConfig { public: GCNPassConfig(TargetMachine *TM, PassManagerBase &PM) : AMDGPUPassConfig(TM, PM) { } + + GCNTargetMachine &getGCNTargetMachine() const 
{ + return getTM<GCNTargetMachine>(); + } + + ScheduleDAGInstrs * + createMachineScheduler(MachineSchedContext *C) const override; + bool addPreISel() override; + void addMachineSSAOptimization() override; bool addInstSelector() override; +#ifdef LLVM_BUILD_GLOBAL_ISEL + bool addIRTranslator() override; + bool addRegBankSelect() override; +#endif void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; - void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; }; @@ -188,12 +327,39 @@ public: TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { return TargetIRAnalysis([this](const Function &F) { - return TargetTransformInfo( - AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); + return TargetTransformInfo(AMDGPUTTIImpl(this, F)); }); } +void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + addPass(createEarlyCSEPass()); +} + +void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + addPass(createSeparateConstOffsetFromGEPPass()); + addPass(createSpeculativeExecutionPass()); + // ReassociateGEPs exposes more opportunites for SLSR. See + // the example in reassociate-geps-and-slsr.ll. + addPass(createStraightLineStrengthReducePass()); + // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or + // EarlyCSE can reuse. + addEarlyCSEOrGVNPass(); + // Run NaryReassociate after EarlyCSE/GVN to be more effective. + addPass(createNaryReassociatePass()); + // NaryReassociate on GEPs creates redundant common expressions, so run + // EarlyCSE after it. + addPass(createEarlyCSEPass()); +} + void AMDGPUPassConfig::addIRPasses() { + // There is no reason to run these. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + disablePass(&PatchableFunctionID); + // Function calls are not supported, so make sure we inline everything. addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerPass()); @@ -207,24 +373,43 @@ void AMDGPUPassConfig::addIRPasses() { // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); + if (TM.getOptLevel() > CodeGenOpt::None) { + addPass(createAMDGPUPromoteAlloca(&TM)); + + if (EnableSROA) + addPass(createSROAPass()); + } + + addStraightLineScalarOptimizationPasses(); + TargetPassConfig::addIRPasses(); + + // EarlyCSE is not always strong enough to clean up what LSR produces. For + // example, GVN can combine + // + // %0 = add %a, %b + // %1 = add %b, %a + // + // and + // + // %0 = shl nsw %a, 2 + // %1 = shl %a, 2 + // + // but EarlyCSE can do neither of them. 
+ if (getOptLevel() != CodeGenOpt::None) + addEarlyCSEOrGVNPass(); } void AMDGPUPassConfig::addCodeGenPrepare() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); - if (ST.isPromoteAllocaEnabled()) { - addPass(createAMDGPUPromoteAlloca(ST)); - addPass(createSROAPass()); - } TargetPassConfig::addCodeGenPrepare(); + + if (EnableLoadStoreVectorizer) + addPass(createLoadStoreVectorizerPass()); } -bool -AMDGPUPassConfig::addPreISel() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); +bool AMDGPUPassConfig::addPreISel() { addPass(createFlattenCFGPass()); - if (ST.IsIRStructurizerEnabled()) - addPass(createStructurizeCFGPass()); return false; } @@ -244,7 +429,9 @@ bool AMDGPUPassConfig::addGCPasses() { bool R600PassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); - addPass(createR600TextureIntrinsicsReplacer()); + + if (EnableR600StructurizeCFG) + addPass(createStructurizeCFGPass()); return false; } @@ -253,9 +440,8 @@ void R600PassConfig::addPreRegAlloc() { } void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) + if (EnableR600IfConvert) addPass(&IfConverterID, false); addPass(createR600ClauseMergePass(*TM), false); } @@ -276,32 +462,62 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { // GCN Pass Setup //===----------------------------------------------------------------------===// +ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( + MachineSchedContext *C) const { + const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + if (ST.enableSIScheduler()) + return createSIMachineScheduler(C); + return nullptr; +} + bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); // FIXME: We need to run a pass to propagate the attributes when calls are // supported. addPass(&AMDGPUAnnotateKernelFeaturesID); - + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions addPass(createSinkingPass()); addPass(createSITypeRewriter()); - addPass(createSIAnnotateControlFlowPass()); addPass(createAMDGPUAnnotateUniformValues()); + addPass(createSIAnnotateControlFlowPass()); return false; } +void GCNPassConfig::addMachineSSAOptimization() { + TargetPassConfig::addMachineSSAOptimization(); + + // We want to fold operands after PeepholeOptimizer has run (or as part of + // it), because it will eliminate extra copies making it easier to fold the + // real source operand. We want to eliminate dead instructions after, so that + // we see fewer uses of the copies. We then need to clean up the dead + // instructions leftover after the operands are folded as well. + // + // XXX - Can we get away without running DeadMachineInstructionElim again? 
+ addPass(&SIFoldOperandsID); + addPass(&DeadMachineInstructionElimID); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); addPass(&SIFixSGPRCopiesID); - addPass(createSIFoldOperandsPass()); return false; } -void GCNPassConfig::addPreRegAlloc() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); +#ifdef LLVM_BUILD_GLOBAL_ISEL +bool GCNPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} +bool GCNPassConfig::addRegBankSelect() { + return false; +} +#endif + +void GCNPassConfig::addPreRegAlloc() { // This needs to be run directly before register allocation because // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass @@ -309,42 +525,48 @@ void GCNPassConfig::addPreRegAlloc() { insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + if (getOptLevel() > CodeGenOpt::None) { // Don't do this with no optimizations since it throws away debug info by // merging nonadjacent loads. // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. + + // FIXME: Move pre-RA and remove extra reg coalescer run. insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } - addPass(createSIShrinkInstructionsPass(), false); + + addPass(createSIShrinkInstructionsPass()); + addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - addPass(&SIFixSGPRLiveRangesID); TargetPassConfig::addFastRegAlloc(RegAllocPass); } void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - // We want to run this after LiveVariables is computed to avoid computing them - // twice. - // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure - // that needs to be fixed. - insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } -void GCNPassConfig::addPostRegAlloc() { - addPass(createSIShrinkInstructionsPass(), false); -} - void GCNPassConfig::addPreSched2() { } void GCNPassConfig::addPreEmitPass() { - addPass(createSIInsertWaits(*TM), false); - addPass(createSILowerControlFlowPass(*TM), false); + // The hazard recognizer that runs as part of the post-ra scheduler does not + // guarantee to be able handle all hazards correctly. This is because if there + // are multiple scheduling regions in a basic block, the regions are scheduled + // bottom up, so when we begin to schedule a region we don't know what + // instructions were emitted directly before it. + // + // Here we add a stand-alone hazard recognizer pass which can handle all + // cases. 
+ addPass(&PostRAHazardRecognizerID); + + addPass(createSIInsertWaitsPass()); + addPass(createSIShrinkInstructionsPass()); + addPass(createSILowerControlFlowPass()); + addPass(createSIDebuggerInsertNopsPass()); } TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 236e3f824030..b0eb3a9a15f7 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -12,15 +12,11 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETMACHINE_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H -#include "AMDGPUFrameLowering.h" -#include "AMDGPUInstrInfo.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include "llvm/IR/DataLayout.h" namespace llvm { @@ -29,23 +25,23 @@ namespace llvm { //===----------------------------------------------------------------------===// class AMDGPUTargetMachine : public LLVMTargetMachine { -private: - protected: std::unique_ptr<TargetLoweringObjectFile> TLOF; - AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; + StringRef getGPUName(const Function &F) const; + StringRef getFeatureString(const Function &F) const; + public: - AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); + AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, + CodeGenOpt::Level OL); ~AMDGPUTargetMachine(); - const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; } - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } + const AMDGPUSubtarget *getSubtargetImpl() const; + const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override; + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { return &IntrinsicInfo; } @@ -60,30 +56,47 @@ public: // R600 Target Machine (R600 -> Cayman) //===----------------------------------------------------------------------===// -class R600TargetMachine : public AMDGPUTargetMachine { +class R600TargetMachine final : public AMDGPUTargetMachine { +private: + mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap; public: - R600TargetMachine(const Target &T, const Triple &TT, StringRef FS, - StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); + R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, + CodeGenOpt::Level OL); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + const R600Subtarget *getSubtargetImpl(const Function &) const override; }; //===----------------------------------------------------------------------===// // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// -class GCNTargetMachine : public AMDGPUTargetMachine { +class GCNTargetMachine final : public AMDGPUTargetMachine { +private: + mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap; public: - GCNTargetMachine(const Target &T, const Triple &TT, StringRef FS, - 
StringRef CPU, TargetOptions Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL); + GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, TargetOptions Options, + Optional<Reloc::Model> RM, CodeModel::Model CM, + CodeGenOpt::Level OL); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + const SISubtarget *getSubtargetImpl(const Function &) const override; }; +inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl( + const Function &F) const { + if (getTargetTriple().getArch() == Triple::amdgcn) + return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl(F); + return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl(F); +} + } // End namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp index e050f21091ba..03d1e2c764de 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -29,59 +29,3 @@ MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); } - -//===----------------------------------------------------------------------===// -// HSA Object File -//===----------------------------------------------------------------------===// - - -void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM){ - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); - - TextSection = AMDGPU::getHSATextSection(Ctx); - - DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); - DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); - - RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); -} - -bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( - const char *SectionName) const { - return cast<MCSectionELF>(DataGlobalAgentSection) - ->getSectionName() - .equals(SectionName); -} - -bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { - // Read-only segments can only have agent allocation. - return AMDGPU::isReadOnlySegment(GV) || - (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && - isAgentAllocationSection(GV->getSection())); -} - -bool AMDGPUHSATargetObjectFile::isProgramAllocation( - const GlobalValue *GV) const { - // The default for global segments is program allocation. 
- return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); -} - -MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( - const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const { - if (Kind.isText() && !GV->hasComdat()) - return getTextSection(); - - if (AMDGPU::isGlobalSegment(GV)) { - if (isAgentAllocation(GV)) - return DataGlobalAgentSection; - - if (isProgramAllocation(GV)) - return DataGlobalProgramSection; - } - - return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); -} diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h index 921341ebb897..f530e0952a74 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h +++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -28,24 +28,6 @@ class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { const TargetMachine &TM) const override; }; -class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { -private: - MCSection *DataGlobalAgentSection; - MCSection *DataGlobalProgramSection; - MCSection *RodataReadonlyAgentSection; - - bool isAgentAllocationSection(const char *SectionName) const; - bool isAgentAllocation(const GlobalValue *GV) const; - bool isProgramAllocation(const GlobalValue *GV) const; - -public: - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override; -}; - } // end namespace llvm #endif diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 54a003d6a9cf..3d630fe3ea9d 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" @@ -28,6 +29,7 @@ using namespace llvm; #define DEBUG_TYPE "AMDGPUtti" + void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP) { UP.Threshold = 300; // Twice the default. @@ -78,11 +80,127 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { return Vector ? 0 : 32; } +unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) { + switch (AddrSpace) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + return 128; + case AMDGPUAS::LOCAL_ADDRESS: + case AMDGPUAS::REGION_ADDRESS: + return 64; + case AMDGPUAS::PRIVATE_ADDRESS: + return 8 * ST->getMaxPrivateElementSize(); + default: + if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && + (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS || + AddrSpace == AMDGPUAS::PARAM_I_ADDRESS || + (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 && + AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); + } +} + unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. 
return 64; } +int AMDGPUTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo) { + + EVT OrigTy = TLI->getValueType(DL, Ty); + if (!OrigTy.isSimple()) { + return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); + } + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + int ISD = TLI->InstructionOpcodeToISD(Opcode); + + // Because we don't have any legal vector operations, but the legal types, we + // need to account for split vectors. + unsigned NElts = LT.second.isVector() ? + LT.second.getVectorNumElements() : 1; + + MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; + + switch (ISD) { + case ISD::SHL: + case ISD::SRL: + case ISD::SRA: { + if (SLT == MVT::i64) + return get64BitInstrCost() * LT.first * NElts; + + // i32 + return getFullRateInstrCost() * LT.first * NElts; + } + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + if (SLT == MVT::i64){ + // and, or and xor are typically split into 2 VALU instructions. + return 2 * getFullRateInstrCost() * LT.first * NElts; + } + + return LT.first * NElts * getFullRateInstrCost(); + } + case ISD::MUL: { + const int QuarterRateCost = getQuarterRateInstrCost(); + if (SLT == MVT::i64) { + const int FullRateCost = getFullRateInstrCost(); + return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts; + } + + // i32 + return QuarterRateCost * NElts * LT.first; + } + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + if (SLT == MVT::f64) + return LT.first * NElts * get64BitInstrCost(); + + if (SLT == MVT::f32 || SLT == MVT::f16) + return LT.first * NElts * getFullRateInstrCost(); + break; + + case ISD::FDIV: + case ISD::FREM: + // FIXME: frem should be handled separately. The fdiv in it is most of it, + // but the current lowering is also not entirely correct. + if (SLT == MVT::f64) { + int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); + + // Add cost of workaround. + if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + Cost += 3 * getFullRateInstrCost(); + + return LT.first * Cost * NElts; + } + + // Assuming no fp32 denormals lowering. + if (SLT == MVT::f32 || SLT == MVT::f16) { + assert(!ST->hasFP32Denormals() && "will change when supported"); + int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); + return LT.first * NElts * Cost; + } + + break; + default: + break; + } + + return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); +} + unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { // XXX - For some reason this isn't called for switch. switch (Opcode) { @@ -98,6 +216,11 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: + case Instruction::InsertElement: + // Extracts are just reads of a subregister, so are free. Inserts are + // considered free because we don't want to have any cost for scalarizing + // operations, and we don't have to copy into a different register class. + // Dynamic indexing isn't free and is best avoided. return Index == ~0u ? 
2 : 0; default: @@ -115,6 +238,9 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, // IntrinsicsAMDGPU.td break; + case Intrinsic::amdgcn_workitem_id_x: + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_mbcnt_hi: @@ -122,6 +248,31 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, case Intrinsic::r600_read_tidig_x: case Intrinsic::r600_read_tidig_y: case Intrinsic::r600_read_tidig_z: + case Intrinsic::amdgcn_image_atomic_swap: + case Intrinsic::amdgcn_image_atomic_add: + case Intrinsic::amdgcn_image_atomic_sub: + case Intrinsic::amdgcn_image_atomic_smin: + case Intrinsic::amdgcn_image_atomic_umin: + case Intrinsic::amdgcn_image_atomic_smax: + case Intrinsic::amdgcn_image_atomic_umax: + case Intrinsic::amdgcn_image_atomic_and: + case Intrinsic::amdgcn_image_atomic_or: + case Intrinsic::amdgcn_image_atomic_xor: + case Intrinsic::amdgcn_image_atomic_inc: + case Intrinsic::amdgcn_image_atomic_dec: + case Intrinsic::amdgcn_image_atomic_cmpswap: + case Intrinsic::amdgcn_buffer_atomic_swap: + case Intrinsic::amdgcn_buffer_atomic_add: + case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_smin: + case Intrinsic::amdgcn_buffer_atomic_umin: + case Intrinsic::amdgcn_buffer_atomic_smax: + case Intrinsic::amdgcn_buffer_atomic_umax: + case Intrinsic::amdgcn_buffer_atomic_and: + case Intrinsic::amdgcn_buffer_atomic_or: + case Intrinsic::amdgcn_buffer_atomic_xor: + case Intrinsic::amdgcn_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_ps_live: return true; } @@ -129,18 +280,17 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { default: return false; - case AMDGPUIntrinsic::SI_tid: case AMDGPUIntrinsic::SI_fs_interp: + case AMDGPUIntrinsic::SI_fs_constant: return true; } } static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); - unsigned ShaderType = AMDGPU::getShaderType(*F); // Arguments to compute shaders are never a source of divergence. - if (ShaderType == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) return true; // For non-compute shaders, SGPR inputs are marked with either inreg or byval. @@ -169,6 +319,13 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { if (const LoadInst *Load = dyn_cast<LoadInst>(V)) return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + // Atomics are divergent because they are executed sequentially: when an + // atomic operation refers to the same address in each thread, then each + // thread after the first sees the value written by the previous thread as + // original value. 
+ if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V)) + return true; + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { const TargetMachine &TM = getTLI()->getTargetMachine(); return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 976afb03443b..a82a07458086 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -14,18 +14,18 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H -#define LLVM_LIB_TARGET_R600_AMDGPUTARGETTRANSFORMINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H #include "AMDGPU.h" #include "AMDGPUTargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" namespace llvm { +class AMDGPUTargetLowering; -class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { +class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> { typedef BasicTTIImplBase<AMDGPUTTIImpl> BaseT; typedef TargetTransformInfo TTI; friend BaseT; @@ -36,10 +36,33 @@ class AMDGPUTTIImpl : public BasicTTIImplBase<AMDGPUTTIImpl> { const AMDGPUSubtarget *getST() const { return ST; } const AMDGPUTargetLowering *getTLI() const { return TLI; } + + static inline int getFullRateInstrCost() { + return TargetTransformInfo::TCC_Basic; + } + + static inline int getHalfRateInstrCost() { + return 2 * TargetTransformInfo::TCC_Basic; + } + + // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe + // should be 2 or 4. + static inline int getQuarterRateInstrCost() { + return 3 * TargetTransformInfo::TCC_Basic; + } + + // On some parts, normal fp64 operations are half rate, and others + // quarter. This also applies to some integer operations. + inline int get64BitInstrCost() const { + return ST->hasHalfRate64Ops() ? + getHalfRateInstrCost() : getQuarterRateInstrCost(); + } + public: - explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL) - : BaseT(TM, DL), ST(TM->getSubtargetImpl()), - TLI(ST->getTargetLowering()) {} + explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} // Provide value semantics. MSVC requires that we spell all of these out. AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg) @@ -54,17 +77,27 @@ public: TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - return ST->hasBCNT(TyWidth) ? 
TTI::PSK_FastHardware : TTI::PSK_Software; + return TTI::PSK_FastHardware; } unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace); unsigned getMaxInterleaveFactor(unsigned VF); + int getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); + unsigned getCFInstrCost(unsigned Opcode); int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; + + unsigned getVectorSplitCost() { return 0; } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 917efd149e00..21de76396b16 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -50,8 +50,6 @@ STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " "matched"); STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " "matched"); -STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " - "pattern matched"); STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); @@ -162,7 +160,7 @@ public: bool prepare(); bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + TII = MF.getSubtarget<R600Subtarget>().getInstrInfo(); TRI = &TII->getRegisterInfo(); DEBUG(MF.dump();); OrderedBlks.clear(); @@ -213,7 +211,6 @@ protected: int getSCCNum(MachineBasicBlock *MBB) const; MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const; bool hasBackEdge(MachineBasicBlock *MBB) const; - static unsigned getLoopDepth(MachineLoop *LoopRep); bool isRetiredBlock(MachineBasicBlock *MBB) const; bool isActiveLoophead(MachineBasicBlock *MBB) const; PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB, @@ -229,16 +226,15 @@ protected: // Function originally from CFGStructTraits void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); + const DebugLoc &DL = DebugLoc()); MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode, - DebugLoc DL = DebugLoc()); + const DebugLoc &DL = DebugLoc()); MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode); void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode, - DebugLoc DL); + const DebugLoc &DL); void insertCondBranchBefore(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL); - void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum); + MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL); static int getBranchNzeroOpcode(int OldOpcode); static int getBranchZeroOpcode(int OldOpcode); static int getContinueNzeroOpcode(int OldOpcode); @@ -257,7 +253,6 @@ protected: /// instruction. Such move instruction "belong to" the loop backward-edge. 
MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB); static MachineInstr *getReturnInstr(MachineBasicBlock *MBB); - static MachineInstr *getContinueInstr(MachineBasicBlock *MBB); static bool isReturnBlock(MachineBasicBlock *MBB); static void cloneSuccessorList(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) ; @@ -276,11 +271,7 @@ protected: int ifPatternMatch(MachineBasicBlock *MBB); int loopendPatternMatch(); int mergeLoop(MachineLoop *LoopRep); - int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader); - void handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop); /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in /// the same loop with LoopLandInfo without explicitly keeping track of /// loopContBlks and loopBreakBlks, this is a method to get the information. @@ -337,13 +328,7 @@ protected: MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I); void recordSccnum(MachineBasicBlock *MBB, int SCCNum); void retireBlock(MachineBasicBlock *MBB); - void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr); - MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&); - /// This is work around solution for findNearestCommonDominator not available - /// to post dom a proper fix should go to Dominators.h. - MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2); private: MBBInfoMap BlockInfoMap; @@ -376,10 +361,6 @@ bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const { return MBB->isSuccessor(LoopHeader); } -unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) { - return LoopRep ? LoopRep->getLoopDepth() : 0; -} - bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const { MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB); if (It == BlockInfoMap.end()) @@ -442,7 +423,8 @@ bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const { void AMDGPUCFGStructurizer::reversePredicateSetter( MachineBasicBlock::iterator I) { - while (I--) { + assert(static_cast<MachineInstr *>(I) && "Expected valid iterator"); + for (;; --I) { if (I->getOpcode() == AMDGPU::PRED_X) { switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) { case OPCODE_IS_ZERO_INT: @@ -469,16 +451,17 @@ void AMDGPUCFGStructurizer::reversePredicateSetter( } void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { - MachineInstr *MI = MBB->getParent() - ->CreateMachineInstr(TII->get(NewOpcode), DL); + int NewOpcode, const DebugLoc &DL) { + MachineInstr *MI = + MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); MBB->push_back(MI); //assume the instruction doesn't take any reg operand ... 
SHOWNEWINSTR(MI); } MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB, - int NewOpcode, DebugLoc DL) { + int NewOpcode, + const DebugLoc &DL) { MachineInstr *MI = MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL); if (MBB->begin() != MBB->end()) @@ -502,7 +485,7 @@ MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore( } void AMDGPUCFGStructurizer::insertCondBranchBefore( - MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) { + MachineBasicBlock::iterator I, int NewOpcode, const DebugLoc &DL) { MachineInstr *OldMI = &(*I); MachineBasicBlock *MBB = OldMI->getParent(); MachineFunction *MF = MBB->getParent(); @@ -514,9 +497,9 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore( //erase later oldInstr->eraseFromParent(); } -void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, - MachineBasicBlock::iterator I, int NewOpcode, int RegNum, - DebugLoc DL) { +void AMDGPUCFGStructurizer::insertCondBranchBefore( + MachineBasicBlock *blk, MachineBasicBlock::iterator I, int NewOpcode, + int RegNum, const DebugLoc &DL) { MachineFunction *MF = blk->getParent(); MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL); //insert before @@ -525,16 +508,6 @@ void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk, SHOWNEWINSTR(NewInstr); } -void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB, - int NewOpcode, int RegNum) { - MachineFunction *MF = MBB->getParent(); - MachineInstr *NewInstr = - MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc()); - MBB->push_back(NewInstr); - MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false); - SHOWNEWINSTR(NewInstr); -} - int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { case AMDGPU::JUMP_COND: @@ -664,16 +637,6 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) { return nullptr; } -MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) { - MachineBasicBlock::reverse_iterator It = MBB->rbegin(); - if (It != MBB->rend()) { - MachineInstr *MI = &(*It); - if (MI->getOpcode() == AMDGPU::CONTINUE) - return MI; - } - return nullptr; -} - bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) { MachineInstr *MI = getReturnInstr(MBB); bool IsReturn = (MBB->succ_size() == 0); @@ -697,11 +660,8 @@ MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) { MachineFunction *Func = MBB->getParent(); MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock(); Func->push_back(NewMBB); //insert to function - for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end(); - It != E; ++It) { - MachineInstr *MI = Func->CloneMachineInstr(It); - NewMBB->push_back(MI); - } + for (const MachineInstr &It : *MBB) + NewMBB->push_back(Func->CloneMachineInstr(&It)); return NewMBB; } @@ -727,7 +687,7 @@ void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) { while (It != E) { if (Pre->getOpcode() == AMDGPU::CONTINUE && It->getOpcode() == AMDGPU::ENDLOOP) - ContInstr.push_back(Pre); + ContInstr.push_back(&*Pre); Pre = It; ++It; } @@ -923,7 +883,7 @@ bool AMDGPUCFGStructurizer::run() { if (!Finish) { DEBUG(FuncRep->viewCFG()); - llvm_unreachable("IRREDUCIBLE_CFG"); + report_fatal_error("IRREDUCIBLE_CFG"); } return true; @@ -1145,34 +1105,6 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) { return 1; } -int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, - MachineBasicBlock *LoopHeader) { - int NumCont = 0; - 
SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB; - typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM; - GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader), - E = GTIM::child_end(LoopHeader); - for (; It != E; ++It) { - MachineBasicBlock *MBB = *It; - if (LoopRep->contains(MBB)) { - handleLoopcontBlock(MBB, MLI->getLoopFor(MBB), - LoopHeader, LoopRep); - ContMBB.push_back(MBB); - ++NumCont; - } - } - - for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), - E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader, true); - } - - numLoopcontPatternMatch += NumCont; - - return NumCont; -} - - bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak( MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) { if (Src1MBB->succ_size() == 0) { @@ -1413,10 +1345,10 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); if (LandBlkHasOtherPred) { - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); unsigned CmpResReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - llvm_unreachable("Extra compare instruction needed to handle CFG"); + report_fatal_error("Extra compare instruction needed to handle CFG"); insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, CmpResReg, DebugLoc()); } @@ -1433,7 +1365,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // need to uncondionally insert the assignment to ensure a path from its // predecessor rather than headBlk has valid value in initReg if // (initVal != 1). - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } insertInstrBefore(I, AMDGPU::ELSE); @@ -1442,7 +1374,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // need to uncondionally insert the assignment to ensure a path from its // predecessor rather than headBlk has valid value in initReg if // (initVal != 0) - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } if (LandBlkHasOtherPred) { @@ -1454,7 +1386,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, PE = LandBlk->pred_end(); PI != PE; ++PI) { MachineBasicBlock *MBB = *PI; if (MBB != TrueMBB && MBB != FalseMBB) - llvm_unreachable("Extra register needed to handle CFG"); + report_fatal_error("Extra register needed to handle CFG"); } } DEBUG( @@ -1468,17 +1400,6 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, return NumNewBlk; } -void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB, - MachineLoop *ContingLoop, MachineBasicBlock *ContMBB, - MachineLoop *ContLoop) { - DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber() - << " header = BB" << ContMBB->getNumber() << "\n"; - dbgs() << "Trying to continue loop-depth = " - << getLoopDepth(ContLoop) - << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";); - settleLoopcontBlock(ContingMBB, ContMBB); -} - void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, MachineBasicBlock *SrcMBB) { DEBUG( @@ -1809,76 +1730,6 @@ void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) { && "can't retire block yet"); } -void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep, - MachineBasicBlock *MBB) { - 
MachineBasicBlock *&TheEntry = LLInfoMap[loopRep]; - if (!MBB) { - MBB = FuncRep->CreateMachineBasicBlock(); - FuncRep->push_back(MBB); //insert to function - SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: "); - } - TheEntry = MBB; - DEBUG( - dbgs() << "setLoopLandBlock loop-header = BB" - << loopRep->getHeader()->getNumber() - << " landing-block = BB" << MBB->getNumber() << "\n"; - ); -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1, - MachineBasicBlock *MBB2) { - - if (PDT->dominates(MBB1, MBB2)) - return MBB1; - if (PDT->dominates(MBB2, MBB1)) - return MBB2; - - MachineDomTreeNode *Node1 = PDT->getNode(MBB1); - MachineDomTreeNode *Node2 = PDT->getNode(MBB2); - - // Handle newly cloned node. - if (!Node1 && MBB1->succ_size() == 1) - return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2); - if (!Node2 && MBB2->succ_size() == 1) - return findNearestCommonPostDom(MBB1, *MBB2->succ_begin()); - - if (!Node1 || !Node2) - return nullptr; - - Node1 = Node1->getIDom(); - while (Node1) { - if (PDT->dominates(Node1, Node2)) - return Node1->getBlock(); - Node1 = Node1->getIDom(); - } - - return nullptr; -} - -MachineBasicBlock * -AMDGPUCFGStructurizer::findNearestCommonPostDom( - std::set<MachineBasicBlock *> &MBBs) { - MachineBasicBlock *CommonDom; - std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin(); - std::set<MachineBasicBlock *>::const_iterator E = MBBs.end(); - for (CommonDom = *It; It != E && CommonDom; ++It) { - MachineBasicBlock *MBB = *It; - if (MBB != CommonDom) - CommonDom = findNearestCommonPostDom(MBB, CommonDom); - } - - DEBUG( - dbgs() << "Common post dominator for exit blocks is "; - if (CommonDom) - dbgs() << "BB" << CommonDom->getNumber() << "\n"; - else - dbgs() << "NULL\n"; - ); - - return CommonDom; -} - char AMDGPUCFGStructurizer::ID = 0; } // end anonymous namespace diff --git a/lib/Target/AMDGPU/AMDKernelCodeT.h b/lib/Target/AMDGPU/AMDKernelCodeT.h index a9ba60c8cbad..5d243e949fd3 100644 --- a/lib/Target/AMDGPU/AMDKernelCodeT.h +++ b/lib/Target/AMDGPU/AMDKernelCodeT.h @@ -44,6 +44,15 @@ enum amd_code_version_t { AMD_CODE_VERSION_MINOR = 1 }; +// Sets val bits for specified mask in specified dst packed instance. +#define AMD_HSA_BITS_SET(dst, mask, val) \ + dst &= (~(1 << mask ## _SHIFT) & ~mask); \ + dst |= (((val) << mask ## _SHIFT) & mask) + +// Gets bits for specified mask from specified src packed instance. +#define AMD_HSA_BITS_GET(src, mask) \ + ((src & mask) >> mask ## _SHIFT) \ + /// The values used to define the number of bytes to use for the /// swizzle element size. enum amd_element_byte_size_t { @@ -118,10 +127,14 @@ enum amd_code_property_mask_t { AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, + AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, + AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6, + AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, + /// Control wave ID base counter for GDS ordered-append. Used to set /// COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. 
(Not sure if /// ORDERED_APPEND_MODE also needs to be settable) - AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 10, + AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT, @@ -146,7 +159,7 @@ enum amd_code_property_mask_t { /// is generally DWORD. /// /// uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM. - AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 11, + AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, @@ -155,7 +168,7 @@ enum amd_code_property_mask_t { /// HSA_MACHINE_LARGE. Must also match /// SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), /// SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+). - AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 13, + AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19, AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, @@ -167,18 +180,22 @@ enum amd_code_property_mask_t { /// workitem_private_segment_byte_size only specifies the statically /// know private segment size, and additional space must be added /// for the call stack. - AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 14, + AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, /// Indicate if code generated has support for debugging. 
- AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 15, + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, - AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT + AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, + + AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23, + AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, + AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT }; /// @brief The hsa_ext_control_directives_t specifies the values for the HSAIL diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index d9f753f40133..efcf1b23adaa 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1,4 +1,4 @@ -//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ----------===// +//===-- AMDGPUAsmParser.cpp - Parse SI asm to MCInst instructions ---------===// // // The LLVM Compiler Infrastructure // @@ -7,15 +7,17 @@ // //===----------------------------------------------------------------------===// +#include "AMDKernelCodeT.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "MCTargetDesc/AMDGPUTargetStreamer.h" -#include "Utils/AMDGPUBaseInfo.h" -#include "AMDKernelCodeT.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "Utils/AMDKernelCodeTUtils.h" +#include "Utils/AMDGPUAsmUtils.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" @@ -25,16 +27,17 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" using namespace llvm; @@ -42,6 +45,8 @@ namespace { struct OptionalOperand; +enum RegisterKind { IS_UNKNOWN, IS_VGPR, IS_SGPR, IS_TTMP, IS_SPECIAL }; + class AMDGPUOperand : public MCParsedAsmOperand { enum KindTy { Token, @@ -55,19 +60,74 @@ class AMDGPUOperand : public MCParsedAsmOperand { public: AMDGPUOperand(enum KindTy K) : MCParsedAsmOperand(), Kind(K) {} - MCContext *Ctx; + typedef std::unique_ptr<AMDGPUOperand> Ptr; + + struct Modifiers { + bool Abs; + bool Neg; + bool Sext; + + bool hasFPModifiers() const { return Abs || Neg; } + bool hasIntModifiers() const { return Sext; } + bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); } + + int64_t getFPModifiersOperand() const { 
+ int64_t Operand = 0; + Operand |= Abs ? SISrcMods::ABS : 0; + Operand |= Neg ? SISrcMods::NEG : 0; + return Operand; + } + + int64_t getIntModifiersOperand() const { + int64_t Operand = 0; + Operand |= Sext ? SISrcMods::SEXT : 0; + return Operand; + } + + int64_t getModifiersOperand() const { + assert(!(hasFPModifiers() && hasIntModifiers()) + && "fp and int modifiers should not be used simultaneously"); + if (hasFPModifiers()) { + return getFPModifiersOperand(); + } else if (hasIntModifiers()) { + return getIntModifiersOperand(); + } else { + return 0; + } + } + + friend raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods); + }; enum ImmTy { ImmTyNone, - ImmTyDSOffset0, - ImmTyDSOffset1, ImmTyGDS, + ImmTyOffen, + ImmTyIdxen, + ImmTyAddr64, ImmTyOffset, + ImmTyOffset0, + ImmTyOffset1, ImmTyGLC, ImmTySLC, ImmTyTFE, - ImmTyClamp, - ImmTyOMod + ImmTyClampSI, + ImmTyOModSI, + ImmTyDppCtrl, + ImmTyDppRowMask, + ImmTyDppBankMask, + ImmTyDppBoundCtrl, + ImmTySdwaDstSel, + ImmTySdwaSrc0Sel, + ImmTySdwaSrc1Sel, + ImmTySdwaDstUnused, + ImmTyDMask, + ImmTyUNorm, + ImmTyDA, + ImmTyR128, + ImmTyLWE, + ImmTyHwreg, + ImmTySendMsg, }; struct TokOp { @@ -79,11 +139,12 @@ public: bool IsFPImm; ImmTy Type; int64_t Val; + Modifiers Mods; }; struct RegOp { unsigned RegNo; - int Modifiers; + Modifiers Mods; const MCRegisterInfo *TRI; const MCSubtargetInfo *STI; bool IsForcedVOP3; @@ -96,175 +157,323 @@ public: const MCExpr *Expr; }; - void addImmOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm(getImm())); + bool isToken() const override { + if (Kind == Token) + return true; + + if (Kind != Expression || !Expr) + return false; + + // When parsing operands, we can't always tell if something was meant to be + // a token, like 'gds', or an expression that references a global variable. + // In this case, we assume the string is an expression, and if we need to + // interpret is a token, then we treat the symbol name as the token. + return isa<MCSymbolRefExpr>(Expr); } - StringRef getToken() const { - return StringRef(Tok.Data, Tok.Length); + bool isImm() const override { + return Kind == Immediate; } - void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); + bool isInlinableImm() const { + if (!isImmTy(ImmTyNone)) { + // Only plain immediates are inlinable (e.g. "clamp" attribute is not) + return false; + } + // TODO: We should avoid using host float here. It would be better to + // check the float bit values which is what a few other places do. + // We've had bot failures before due to weird NaN support on mips hosts. + const float F = BitsToFloat(Imm.Val); + // TODO: Add 1/(2*pi) for VI + return (Imm.Val <= 64 && Imm.Val >= -16) || + (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || + F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0); } - void addRegOrImmOperands(MCInst &Inst, unsigned N) const { - if (isReg()) - addRegOperands(Inst, N); - else - addImmOperands(Inst, N); + bool isRegKind() const { + return Kind == Register; } - void addRegWithInputModsOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createImm( - Reg.Modifiers == -1 ? 
0 : Reg.Modifiers)); - addRegOperands(Inst, N); + bool isReg() const override { + return isRegKind() && !Reg.Mods.hasModifiers(); + } + + bool isRegOrImmWithInputMods() const { + return isRegKind() || isInlinableImm(); + } + + bool isImmTy(ImmTy ImmT) const { + return isImm() && Imm.Type == ImmT; + } + + bool isImmModifier() const { + return isImm() && Imm.Type != ImmTyNone; + } + + bool isClampSI() const { return isImmTy(ImmTyClampSI); } + bool isOModSI() const { return isImmTy(ImmTyOModSI); } + bool isDMask() const { return isImmTy(ImmTyDMask); } + bool isUNorm() const { return isImmTy(ImmTyUNorm); } + bool isDA() const { return isImmTy(ImmTyDA); } + bool isR128() const { return isImmTy(ImmTyUNorm); } + bool isLWE() const { return isImmTy(ImmTyLWE); } + bool isOffen() const { return isImmTy(ImmTyOffen); } + bool isIdxen() const { return isImmTy(ImmTyIdxen); } + bool isAddr64() const { return isImmTy(ImmTyAddr64); } + bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); } + bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<16>(getImm()); } + bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); } + bool isGDS() const { return isImmTy(ImmTyGDS); } + bool isGLC() const { return isImmTy(ImmTyGLC); } + bool isSLC() const { return isImmTy(ImmTySLC); } + bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } + bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } + bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } + bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); } + bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); } + bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); } + bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); } + + bool isMod() const { + return isClampSI() || isOModSI(); } - void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { - if (isImm()) - addImmOperands(Inst, N); - else { - assert(isExpr()); - Inst.addOperand(MCOperand::createExpr(Expr)); - } + bool isRegOrImm() const { + return isReg() || isImm(); } - bool defaultTokenHasSuffix() const { - StringRef Token(Tok.Data, Tok.Length); + bool isRegClass(unsigned RCID) const { + return isReg() && Reg.TRI->getRegClass(RCID).contains(getReg()); + } - return Token.endswith("_e32") || Token.endswith("_e64"); + bool isSCSrc32() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_32RegClassID); } - bool isToken() const override { - return Kind == Token; + bool isSCSrc64() const { + return isInlinableImm() || isRegClass(AMDGPU::SReg_64RegClassID); } - bool isImm() const override { - return Kind == Immediate; + bool isSSrc32() const { + return isImm() || isSCSrc32() || isExpr(); } - bool isInlineImm() const { - float F = BitsToFloat(Imm.Val); - // TODO: Add 0.5pi for VI - return isImm() && ((Imm.Val <= 64 && Imm.Val >= -16) || - (F == 0.0 || F == 0.5 || F == -0.5 || F == 1.0 || F == -1.0 || - F == 2.0 || F == -2.0 || F == 4.0 || F == -4.0)); + bool isSSrc64() const { + // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. + // See isVSrc64(). 
+ return isImm() || isSCSrc64(); } - bool isDSOffset0() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset0; + bool isVCSrc32() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_32RegClassID); } - bool isDSOffset1() const { - assert(isImm()); - return Imm.Type == ImmTyDSOffset1; + bool isVCSrc64() const { + return isInlinableImm() || isRegClass(AMDGPU::VS_64RegClassID); } - int64_t getImm() const { - return Imm.Val; + bool isVSrc32() const { + return isImm() || isVCSrc32(); } - enum ImmTy getImmTy() const { - assert(isImm()); - return Imm.Type; + bool isVSrc64() const { + // TODO: Check if the 64-bit value (coming from assembly source) can be + // narrowed to 32 bits (in the instruction stream). That require knowledge + // of instruction type (unsigned/signed, floating or "untyped"/B64), + // see [AMD GCN3 ISA 6.3.1]. + // TODO: How 64-bit values are formed from 32-bit literals in _B64 insns? + return isImm() || isVCSrc64(); } - bool isRegKind() const { - return Kind == Register; + bool isMem() const override { + return false; } - bool isReg() const override { - return Kind == Register && Reg.Modifiers == -1; + bool isExpr() const { + return Kind == Expression; } - bool isRegWithInputMods() const { - return Kind == Register && (Reg.IsForcedVOP3 || Reg.Modifiers != -1); + bool isSoppBrTarget() const { + return isExpr() || isImm(); } - void setModifiers(unsigned Mods) { - assert(isReg()); - Reg.Modifiers = Mods; + bool isSWaitCnt() const; + bool isHwreg() const; + bool isSendMsg() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; + bool isDPPCtrl() const; + + StringRef getExpressionAsToken() const { + assert(isExpr()); + const MCSymbolRefExpr *S = cast<MCSymbolRefExpr>(Expr); + return S->getSymbol().getName(); } - bool hasModifiers() const { - assert(isRegKind()); - return Reg.Modifiers != -1; + + StringRef getToken() const { + assert(isToken()); + + if (Kind == Expression) + return getExpressionAsToken(); + + return StringRef(Tok.Data, Tok.Length); + } + + int64_t getImm() const { + assert(isImm()); + return Imm.Val; + } + + enum ImmTy getImmTy() const { + assert(isImm()); + return Imm.Type; } unsigned getReg() const override { return Reg.RegNo; } - bool isRegOrImm() const { - return isReg() || isImm(); + SMLoc getStartLoc() const override { + return StartLoc; } - bool isRegClass(unsigned RCID) const { - return Reg.TRI->getRegClass(RCID).contains(getReg()); + SMLoc getEndLoc() const override { + return EndLoc; } - bool isSCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + Modifiers getModifiers() const { + assert(isRegKind() || isImmTy(ImmTyNone)); + return isRegKind() ? 
Reg.Mods : Imm.Mods; } - bool isSSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::SReg_32RegClassID)); + void setModifiers(Modifiers Mods) { + assert(isRegKind() || isImmTy(ImmTyNone)); + if (isRegKind()) + Reg.Mods = Mods; + else + Imm.Mods = Mods; } - bool isSSrc64() const { - return isImm() || isInlineImm() || - (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); + bool hasModifiers() const { + return getModifiers().hasModifiers(); } - - bool isSCSrc64() const { - return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + + bool hasFPModifiers() const { + return getModifiers().hasFPModifiers(); } - bool isVCSrc32() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + bool hasIntModifiers() const { + return getModifiers().hasIntModifiers(); } - bool isVCSrc64() const { - return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const { + if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers()) { + // Apply modifiers to immediate value + int64_t Val = Imm.Val; + bool Negate = Imm.Mods.Neg; // Only negate can get here + if (Imm.IsFPImm) { + APFloat F(BitsToFloat(Val)); + if (Negate) { + F.changeSign(); + } + Val = F.bitcastToAPInt().getZExtValue(); + } else { + Val = Negate ? -Val : Val; + } + Inst.addOperand(MCOperand::createImm(Val)); + } else { + Inst.addOperand(MCOperand::createImm(getImm())); + } } - bool isVSrc32() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); + void addRegOperands(MCInst &Inst, unsigned N) const { + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } - bool isVSrc64() const { - return isImm() || (isReg() && isRegClass(AMDGPU::VS_64RegClassID)); + void addRegOrImmOperands(MCInst &Inst, unsigned N) const { + if (isRegKind()) + addRegOperands(Inst, N); + else if (isExpr()) + Inst.addOperand(MCOperand::createExpr(Expr)); + else + addImmOperands(Inst, N); } - bool isMem() const override { - return false; + void addRegOrImmWithInputModsOperands(MCInst &Inst, unsigned N) const { + Modifiers Mods = getModifiers(); + Inst.addOperand(MCOperand::createImm(Mods.getModifiersOperand())); + if (isRegKind()) { + addRegOperands(Inst, N); + } else { + addImmOperands(Inst, N, false); + } } - bool isExpr() const { - return Kind == Expression; + void addRegOrImmWithFPInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasIntModifiers()); + addRegOrImmWithInputModsOperands(Inst, N); } - bool isSoppBrTarget() const { - return isExpr() || isImm(); + void addRegOrImmWithIntInputModsOperands(MCInst &Inst, unsigned N) const { + assert(!hasFPModifiers()); + addRegOrImmWithInputModsOperands(Inst, N); } - SMLoc getStartLoc() const override { - return StartLoc; + void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const { + if (isImm()) + addImmOperands(Inst, N); + else { + assert(isExpr()); + Inst.addOperand(MCOperand::createExpr(Expr)); + } } - SMLoc getEndLoc() const override { - return EndLoc; + void printImmTy(raw_ostream& OS, ImmTy Type) const { + switch (Type) { + case ImmTyNone: OS << "None"; break; + case ImmTyGDS: OS << "GDS"; break; + case ImmTyOffen: OS << "Offen"; break; + case ImmTyIdxen: OS << "Idxen"; break; + case ImmTyAddr64: OS << "Addr64"; break; + case ImmTyOffset: OS << "Offset"; break; + case ImmTyOffset0: OS << "Offset0"; break; + case ImmTyOffset1: OS << "Offset1"; break; + case ImmTyGLC: OS << "GLC"; break; + case ImmTySLC: OS 
<< "SLC"; break; + case ImmTyTFE: OS << "TFE"; break; + case ImmTyClampSI: OS << "ClampSI"; break; + case ImmTyOModSI: OS << "OModSI"; break; + case ImmTyDppCtrl: OS << "DppCtrl"; break; + case ImmTyDppRowMask: OS << "DppRowMask"; break; + case ImmTyDppBankMask: OS << "DppBankMask"; break; + case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break; + case ImmTySdwaDstSel: OS << "SdwaDstSel"; break; + case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break; + case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break; + case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break; + case ImmTyDMask: OS << "DMask"; break; + case ImmTyUNorm: OS << "UNorm"; break; + case ImmTyDA: OS << "DA"; break; + case ImmTyR128: OS << "R128"; break; + case ImmTyLWE: OS << "LWE"; break; + case ImmTyHwreg: OS << "Hwreg"; break; + case ImmTySendMsg: OS << "SendMsg"; break; + } } void print(raw_ostream &OS) const override { switch (Kind) { case Register: - OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + OS << "<register " << getReg() << " mods: " << Reg.Mods << '>'; break; case Immediate: - OS << getImm(); + OS << '<' << getImm(); + if (getImmTy() != ImmTyNone) { + OS << " type: "; printImmTy(OS, getImmTy()); + } + OS << " mods: " << Imm.Mods << '>'; break; case Token: OS << '\'' << getToken() << '\''; @@ -275,20 +484,21 @@ public: } } - static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc Loc, - enum ImmTy Type = ImmTyNone, - bool IsFPImm = false) { + static AMDGPUOperand::Ptr CreateImm(int64_t Val, SMLoc Loc, + enum ImmTy Type = ImmTyNone, + bool IsFPImm = false) { auto Op = llvm::make_unique<AMDGPUOperand>(Immediate); Op->Imm.Val = Val; Op->Imm.IsFPImm = IsFPImm; Op->Imm.Type = Type; + Op->Imm.Mods = {false, false, false}; Op->StartLoc = Loc; Op->EndLoc = Loc; return Op; } - static std::unique_ptr<AMDGPUOperand> CreateToken(StringRef Str, SMLoc Loc, - bool HasExplicitEncodingSize = true) { + static AMDGPUOperand::Ptr CreateToken(StringRef Str, SMLoc Loc, + bool HasExplicitEncodingSize = true) { auto Res = llvm::make_unique<AMDGPUOperand>(Token); Res->Tok.Data = Str.data(); Res->Tok.Length = Str.size(); @@ -297,43 +507,43 @@ public: return Res; } - static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, - SMLoc E, - const MCRegisterInfo *TRI, - const MCSubtargetInfo *STI, - bool ForceVOP3) { + static AMDGPUOperand::Ptr CreateReg(unsigned RegNo, SMLoc S, + SMLoc E, + const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, + bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; Op->Reg.STI = STI; - Op->Reg.Modifiers = -1; + Op->Reg.Mods = {false, false, false}; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; Op->EndLoc = E; return Op; } - static std::unique_ptr<AMDGPUOperand> CreateExpr(const class MCExpr *Expr, SMLoc S) { + static AMDGPUOperand::Ptr CreateExpr(const class MCExpr *Expr, SMLoc S) { auto Op = llvm::make_unique<AMDGPUOperand>(Expression); Op->Expr = Expr; Op->StartLoc = S; Op->EndLoc = S; return Op; } - - bool isDSOffset() const; - bool isDSOffset01() const; - bool isSWaitCnt() const; - bool isMubufOffset() const; - bool isSMRDOffset() const; - bool isSMRDLiteralOffset() const; }; +raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { + OS << "abs:" << Mods.Abs << " neg: " << Mods.Neg << " sext:" << Mods.Sext; + return OS; +} + class AMDGPUAsmParser : public MCTargetAsmParser { const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + bool ForcedDPP; + bool 
ForcedSDWA; bool isSI() const { return AMDGPU::isSI(getSTI()); @@ -373,9 +583,11 @@ private: bool ParseSectionDirectiveHSADataGlobalAgent(); bool ParseSectionDirectiveHSADataGlobalProgram(); bool ParseSectionDirectiveHSARodataReadonlyAgent(); + bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum); + bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth); + void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn); public: -public: enum AMDGPUMatchResultTy { Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY }; @@ -384,7 +596,9 @@ public: const MCInstrInfo &MII, const MCTargetOptions &Options) : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0) { + ForcedEncodingSize(0), + ForcedDPP(false), + ForcedSDWA(false) { MCAsmParserExtension::Initialize(Parser); if (getSTI().getFeatureBits().none()) { @@ -393,6 +607,21 @@ public: } setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + + { + // TODO: make those pre-defined variables read-only. + // Currently there is none suitable machinery in the core llvm-mc for this. + // MCSymbol::isRedefinable is intended for another purpose, and + // AsmParser::parseDirectiveSet() cannot be specialized for specific target. + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); + MCContext &Ctx = getContext(); + MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx)); + Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping")); + Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx)); + } } AMDGPUTargetStreamer &getTargetStreamer() { @@ -400,84 +629,117 @@ public: return static_cast<AMDGPUTargetStreamer &>(TS); } - unsigned getForcedEncodingSize() const { - return ForcedEncodingSize; - } - - void setForcedEncodingSize(unsigned Size) { - ForcedEncodingSize = Size; - } + void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; } + void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; } + void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; } - bool isForcedVOP3() const { - return ForcedEncodingSize == 64; - } + unsigned getForcedEncodingSize() const { return ForcedEncodingSize; } + bool isForcedVOP3() const { return ForcedEncodingSize == 64; } + bool isForcedDPP() const { return ForcedDPP; } + bool isForcedSDWA() const { return ForcedSDWA; } + std::unique_ptr<AMDGPUOperand> parseRegister(); bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; unsigned checkTargetMatchPredicate(MCInst &Inst) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) override; bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, uint64_t &ErrorInfo, bool MatchingInlineAsm) override; bool ParseDirective(AsmToken DirectiveID) override; OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic); + StringRef parseMnemonicSuffix(StringRef Name); bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc, OperandVector &Operands) override; - OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default = 0); + 
OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int); OperandMatchResultTy parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone, + bool (*ConvertResult)(int64_t&) = 0); OperandMatchResultTy parseNamedBit(const char *Name, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy = - AMDGPUOperand::ImmTyNone); - OperandMatchResultTy parseOptionalOps( - const ArrayRef<OptionalOperand> &OptionalOps, - OperandVector &Operands); + enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone); + OperandMatchResultTy parseStringWithPrefix(StringRef Prefix, StringRef &Value); + OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImm(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands); + OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands); void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands); void cvtDS(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseDSOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOff01OptionalOps(OperandVector &Operands); - OperandMatchResultTy parseDSOffsetOptional(OperandVector &Operands); bool parseCnt(int64_t &IntVal); OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands); - OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + OperandMatchResultTy parseHwreg(OperandVector &Operands); - OperandMatchResultTy parseFlatOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseFlatAtomicOptionalOps(OperandVector &Operands); - void cvtFlat(MCInst &Inst, const OperandVector &Operands); +private: + struct OperandInfoTy { + int64_t Id; + bool IsSymbolic; + OperandInfoTy(int64_t Id_) : Id(Id_), IsSymbolic(false) { } + }; - void cvtMubuf(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseOffset(OperandVector &Operands); - OperandMatchResultTy parseMubufOptionalOps(OperandVector &Operands); - OperandMatchResultTy parseGLC(OperandVector &Operands); - OperandMatchResultTy parseSLC(OperandVector &Operands); - OperandMatchResultTy parseTFE(OperandVector &Operands); + bool parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId); + bool parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width); +public: + OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); - OperandMatchResultTy parseDMask(OperandVector &Operands); - OperandMatchResultTy parseUNorm(OperandVector &Operands); - OperandMatchResultTy parseR128(OperandVector &Operands); + OperandMatchResultTy parseSendMsgOp(OperandVector &Operands); + OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands); + void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } + void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } + void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + AMDGPUOperand::Ptr defaultGLC() const; + AMDGPUOperand::Ptr defaultSLC() const; + AMDGPUOperand::Ptr defaultTFE() const; + + AMDGPUOperand::Ptr defaultDMask() const; + AMDGPUOperand::Ptr defaultUNorm() const; + AMDGPUOperand::Ptr defaultDA() const; + AMDGPUOperand::Ptr defaultR128() const; + AMDGPUOperand::Ptr defaultLWE() const; + AMDGPUOperand::Ptr 
defaultSMRDOffset() const; + AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const; + + OperandMatchResultTy parseOModOperand(OperandVector &Operands); + + void cvtId(MCInst &Inst, const OperandVector &Operands); + void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); - OperandMatchResultTy parseVOP3OptionalOps(OperandVector &Operands); + + void cvtMIMG(MCInst &Inst, const OperandVector &Operands); + void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); + + OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); + AMDGPUOperand::Ptr defaultRowMask() const; + AMDGPUOperand::Ptr defaultBankMask() const; + AMDGPUOperand::Ptr defaultBoundCtrl() const; + void cvtDPP(MCInst &Inst, const OperandVector &Operands); + + OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix, + AMDGPUOperand::ImmTy Type); + OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); + void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); + void cvtSDWA(MCInst &Inst, const OperandVector &Operands, + uint64_t BasicInstType); }; struct OptionalOperand { const char *Name; AMDGPUOperand::ImmTy Type; bool IsBit; - int64_t Default; bool (*ConvertResult)(int64_t&); }; } -static int getRegClass(bool IsVgpr, unsigned RegWidth) { - if (IsVgpr) { +static int getRegClass(RegisterKind Is, unsigned RegWidth) { + if (Is == IS_VGPR) { switch (RegWidth) { default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; @@ -487,109 +749,379 @@ static int getRegClass(bool IsVgpr, unsigned RegWidth) { case 8: return AMDGPU::VReg_256RegClassID; case 16: return AMDGPU::VReg_512RegClassID; } + } else if (Is == IS_TTMP) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::TTMP_32RegClassID; + case 2: return AMDGPU::TTMP_64RegClassID; + case 4: return AMDGPU::TTMP_128RegClassID; + } + } else if (Is == IS_SGPR) { + switch (RegWidth) { + default: return -1; + case 1: return AMDGPU::SGPR_32RegClassID; + case 2: return AMDGPU::SGPR_64RegClassID; + case 4: return AMDGPU::SGPR_128RegClassID; + case 8: return AMDGPU::SReg_256RegClassID; + case 16: return AMDGPU::SReg_512RegClassID; + } } - - switch (RegWidth) { - default: return -1; - case 1: return AMDGPU::SGPR_32RegClassID; - case 2: return AMDGPU::SGPR_64RegClassID; - case 4: return AMDGPU::SReg_128RegClassID; - case 8: return AMDGPU::SReg_256RegClassID; - case 16: return AMDGPU::SReg_512RegClassID; - } + return -1; } -static unsigned getRegForName(StringRef RegName) { - +static unsigned getSpecialRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) + .Case("tba", AMDGPU::TBA) + .Case("tma", AMDGPU::TMA) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("tma_lo", AMDGPU::TMA_LO) + .Case("tma_hi", AMDGPU::TMA_HI) + .Case("tba_lo", AMDGPU::TBA_LO) + .Case("tba_hi", AMDGPU::TBA_HI) .Default(0); } bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { - const AsmToken Tok = Parser.getTok(); - StartLoc = Tok.getLoc(); - EndLoc = Tok.getEndLoc(); + auto R = parseRegister(); 
+ if (!R) return true; + assert(R->isReg()); + RegNo = R->getReg(); + StartLoc = R->getStartLoc(); + EndLoc = R->getEndLoc(); + return false; +} + +bool AMDGPUAsmParser::AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum) +{ + switch (RegKind) { + case IS_SPECIAL: + if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) { Reg = AMDGPU::EXEC; RegWidth = 2; return true; } + if (Reg == AMDGPU::FLAT_SCR_LO && Reg1 == AMDGPU::FLAT_SCR_HI) { Reg = AMDGPU::FLAT_SCR; RegWidth = 2; return true; } + if (Reg == AMDGPU::VCC_LO && Reg1 == AMDGPU::VCC_HI) { Reg = AMDGPU::VCC; RegWidth = 2; return true; } + if (Reg == AMDGPU::TBA_LO && Reg1 == AMDGPU::TBA_HI) { Reg = AMDGPU::TBA; RegWidth = 2; return true; } + if (Reg == AMDGPU::TMA_LO && Reg1 == AMDGPU::TMA_HI) { Reg = AMDGPU::TMA; RegWidth = 2; return true; } + return false; + case IS_VGPR: + case IS_SGPR: + case IS_TTMP: + if (Reg1 != Reg + RegWidth) { return false; } + RegWidth++; + return true; + default: + assert(false); return false; + } +} + +bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth) +{ + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + if (getLexer().is(AsmToken::Identifier)) { + StringRef RegName = Parser.getTok().getString(); + if ((Reg = getSpecialRegForName(RegName))) { + Parser.Lex(); + RegKind = IS_SPECIAL; + } else { + unsigned RegNumIndex = 0; + if (RegName[0] == 'v') { + RegNumIndex = 1; + RegKind = IS_VGPR; + } else if (RegName[0] == 's') { + RegNumIndex = 1; + RegKind = IS_SGPR; + } else if (RegName.startswith("ttmp")) { + RegNumIndex = strlen("ttmp"); + RegKind = IS_TTMP; + } else { + return false; + } + if (RegName.size() > RegNumIndex) { + // Single 32-bit register: vXX. + if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum)) + return false; + Parser.Lex(); + RegWidth = 1; + } else { + // Range of registers: v[XX:YY]. ":YY" is optional. 
+ Parser.Lex(); + int64_t RegLo, RegHi; + if (getLexer().isNot(AsmToken::LBrac)) + return false; + Parser.Lex(); + + if (getParser().parseAbsoluteExpression(RegLo)) + return false; + + const bool isRBrace = getLexer().is(AsmToken::RBrac); + if (!isRBrace && getLexer().isNot(AsmToken::Colon)) + return false; + Parser.Lex(); + + if (isRBrace) { + RegHi = RegLo; + } else { + if (getParser().parseAbsoluteExpression(RegHi)) + return false; + + if (getLexer().isNot(AsmToken::RBrac)) + return false; + Parser.Lex(); + } + RegNum = (unsigned) RegLo; + RegWidth = (RegHi - RegLo) + 1; + } + } + } else if (getLexer().is(AsmToken::LBrac)) { + // List of consecutive registers: [s0,s1,s2,s3] + Parser.Lex(); + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) + return false; + if (RegWidth != 1) + return false; + RegisterKind RegKind1; + unsigned Reg1, RegNum1, RegWidth1; + do { + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); + } else if (getLexer().is(AsmToken::RBrac)) { + Parser.Lex(); + break; + } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1)) { + if (RegWidth1 != 1) { + return false; + } + if (RegKind1 != RegKind) { + return false; + } + if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) { + return false; + } + } else { + return false; + } + } while (true); + } else { + return false; + } + switch (RegKind) { + case IS_SPECIAL: + RegNum = 0; + RegWidth = 1; + break; + case IS_VGPR: + case IS_SGPR: + case IS_TTMP: + { + unsigned Size = 1; + if (RegKind == IS_SGPR || RegKind == IS_TTMP) { + // SGPR and TTMP registers must be are aligned. Max required alignment is 4 dwords. + Size = std::min(RegWidth, 4u); + } + if (RegNum % Size != 0) + return false; + RegNum = RegNum / Size; + int RCID = getRegClass(RegKind, RegWidth); + if (RCID == -1) + return false; + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegNum >= RC.getNumRegs()) + return false; + Reg = RC.getRegister(RegNum); + break; + } + + default: + assert(false); return false; + } + + if (!subtargetHasRegister(*TRI, Reg)) + return false; + return true; +} + +std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() { + const auto &Tok = Parser.getTok(); + SMLoc StartLoc = Tok.getLoc(); + SMLoc EndLoc = Tok.getEndLoc(); const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - StringRef RegName = Tok.getString(); - RegNo = getRegForName(RegName); + RegisterKind RegKind; + unsigned Reg, RegNum, RegWidth; - if (RegNo) { + if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) { + return nullptr; + } + return AMDGPUOperand::CreateReg(Reg, StartLoc, EndLoc, + TRI, &getSTI(), false); +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseImm(OperandVector &Operands) { + bool Minus = false; + if (getLexer().getKind() == AsmToken::Minus) { + Minus = true; Parser.Lex(); - return !subtargetHasRegister(*TRI, RegNo); } - // Match vgprs and sgprs - if (RegName[0] != 's' && RegName[0] != 'v') - return true; + SMLoc S = Parser.getTok().getLoc(); + switch(getLexer().getKind()) { + case AsmToken::Integer: { + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { + Error(S, "invalid immediate: only 32-bit values are legal"); + return MatchOperand_ParseFail; + } - bool IsVgpr = RegName[0] == 'v'; - unsigned RegWidth; - unsigned RegIndexInClass; - if (RegName.size() > 1) { - // We have a 32-bit register - RegWidth = 1; - if (RegName.substr(1).getAsInteger(10, RegIndexInClass)) - return 
true; + if (Minus) + IntVal *= -1; + Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); + return MatchOperand_Success; + } + case AsmToken::Real: { + // FIXME: We should emit an error if a double precisions floating-point + // value is used. I'm not sure the best way to detect this. + int64_t IntVal; + if (getParser().parseAbsoluteExpression(IntVal)) + return MatchOperand_ParseFail; + + APFloat F((float)BitsToDouble(IntVal)); + if (Minus) + F.changeSign(); + Operands.push_back( + AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S, + AMDGPUOperand::ImmTyNone, true)); + return MatchOperand_Success; + } + default: + return Minus ? MatchOperand_ParseFail : MatchOperand_NoMatch; + } +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) { + auto res = parseImm(Operands); + if (res != MatchOperand_NoMatch) { + return res; + } + + if (auto R = parseRegister()) { + assert(R->isReg()); + R->Reg.IsForcedVOP3 = isForcedVOP3(); + Operands.push_back(std::move(R)); + return MatchOperand_Success; + } + return MatchOperand_ParseFail; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands) { + // XXX: During parsing we can't determine if minus sign means + // negate-modifier or negative immediate value. + // By default we suppose it is modifier. + bool Negate = false, Abs = false, Abs2 = false; + + if (getLexer().getKind()== AsmToken::Minus) { Parser.Lex(); - } else { - // We have a register greater than 32-bits. + Negate = true; + } - int64_t RegLo, RegHi; + if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "abs") { Parser.Lex(); - if (getLexer().isNot(AsmToken::LBrac)) - return true; + Abs2 = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after abs"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + } + if (getLexer().getKind() == AsmToken::Pipe) { + if (Abs2) { + Error(Parser.getTok().getLoc(), "expected register or immediate"); + return MatchOperand_ParseFail; + } Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegLo)) - return true; + Abs = true; + } - if (getLexer().isNot(AsmToken::Colon)) - return true; + auto Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) { + return Res; + } + AMDGPUOperand::Modifiers Mods = {false, false, false}; + if (Negate) { + Mods.Neg = true; + } + if (Abs) { + if (getLexer().getKind() != AsmToken::Pipe) { + Error(Parser.getTok().getLoc(), "expected vertical bar"); + return MatchOperand_ParseFail; + } Parser.Lex(); - if (getParser().parseAbsoluteExpression(RegHi)) - return true; + Mods.Abs = true; + } + if (Abs2) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Abs = true; + } - if (getLexer().isNot(AsmToken::RBrac)) - return true; + if (Mods.hasFPModifiers()) { + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); + } + return MatchOperand_Success; +} - Parser.Lex(); - RegWidth = (RegHi - RegLo) + 1; - if (IsVgpr) { - // VGPR registers aren't aligned. - RegIndexInClass = RegLo; - } else { - // SGPR registers are aligned. Max alignment is 4 dwords. 
- unsigned Size = std::min(RegWidth, 4u); - if (RegLo % Size != 0) - return true; +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands) { + bool Sext = false; - RegIndexInClass = RegLo / Size; + if (getLexer().getKind() == AsmToken::Identifier && Parser.getTok().getString() == "sext") { + Parser.Lex(); + Sext = true; + if (getLexer().isNot(AsmToken::LParen)) { + Error(Parser.getTok().getLoc(), "expected left paren after sext"); + return MatchOperand_ParseFail; } + Parser.Lex(); } - int RCID = getRegClass(IsVgpr, RegWidth); - if (RCID == -1) - return true; - - const MCRegisterClass RC = TRI->getRegClass(RCID); - if (RegIndexInClass >= RC.getNumRegs()) - return true; + auto Res = parseRegOrImm(Operands); + if (Res != MatchOperand_Success) { + return Res; + } - RegNo = RC.getRegister(RegIndexInClass); - return !subtargetHasRegister(*TRI, RegNo); + AMDGPUOperand::Modifiers Mods = {false, false, false}; + if (Sext) { + if (getLexer().isNot(AsmToken::RParen)) { + Error(Parser.getTok().getLoc(), "expected closing parentheses"); + return MatchOperand_ParseFail; + } + Parser.Lex(); + Mods.Sext = true; + } + + if (Mods.hasIntModifiers()) { + AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); + Op.setModifiers(Mods); + } + return MatchOperand_Success; } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -597,7 +1129,9 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; if ((getForcedEncodingSize() == 32 && (TSFlags & SIInstrFlags::VOP3)) || - (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) + (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3)) || + (isForcedDPP() && !(TSFlags & SIInstrFlags::DPP)) || + (isForcedSDWA() && !(TSFlags & SIInstrFlags::SDWA)) ) return Match_InvalidOperand; if ((TSFlags & SIInstrFlags::VOP3) && @@ -608,7 +1142,6 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } - bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -632,31 +1165,8 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = IDLoc; if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) { - if (isForcedVOP3()) { - // If 64-bit encoding has been forced we can end up with no - // clamp or omod operands if none of the registers have modifiers, - // so we need to add these to the operand list. 
- AMDGPUOperand &LastOp = - ((AMDGPUOperand &)*Operands[Operands.size() - 1]); - if (LastOp.isRegKind() || - (LastOp.isImm() && - LastOp.getImmTy() != AMDGPUOperand::ImmTyNone)) { - SMLoc S = Parser.getTok().getLoc(); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyClamp)); - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOMod)); - bool Res = MatchAndEmitInstruction(IDLoc, Opcode, Operands, - Out, ErrorInfo, - MatchingInlineAsm); - if (!Res) - return Res; - } - - } return Error(IDLoc, "too few operands for instruction"); } - ErrorLoc = ((AMDGPUOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; @@ -762,164 +1272,12 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header) { - - if (getLexer().isNot(AsmToken::Equal)) - return TokError("expected '='"); - Lex(); - - if (getLexer().isNot(AsmToken::Integer)) - return TokError("amd_kernel_code_t values must be integers"); - - uint64_t Value = getLexer().getTok().getIntVal(); + SmallString<40> ErrStr; + raw_svector_ostream Err(ErrStr); + if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) { + return TokError(Err.str()); + } Lex(); - - if (ID == "kernel_code_version_major") - Header.amd_kernel_code_version_major = Value; - else if (ID == "kernel_code_version_minor") - Header.amd_kernel_code_version_minor = Value; - else if (ID == "machine_kind") - Header.amd_machine_kind = Value; - else if (ID == "machine_version_major") - Header.amd_machine_version_major = Value; - else if (ID == "machine_version_minor") - Header.amd_machine_version_minor = Value; - else if (ID == "machine_version_stepping") - Header.amd_machine_version_stepping = Value; - else if (ID == "kernel_code_entry_byte_offset") - Header.kernel_code_entry_byte_offset = Value; - else if (ID == "kernel_code_prefetch_byte_size") - Header.kernel_code_prefetch_byte_size = Value; - else if (ID == "max_scratch_backing_memory_byte_size") - Header.max_scratch_backing_memory_byte_size = Value; - else if (ID == "compute_pgm_rsrc1_vgprs") - Header.compute_pgm_resource_registers |= S_00B848_VGPRS(Value); - else if (ID == "compute_pgm_rsrc1_sgprs") - Header.compute_pgm_resource_registers |= S_00B848_SGPRS(Value); - else if (ID == "compute_pgm_rsrc1_priority") - Header.compute_pgm_resource_registers |= S_00B848_PRIORITY(Value); - else if (ID == "compute_pgm_rsrc1_float_mode") - Header.compute_pgm_resource_registers |= S_00B848_FLOAT_MODE(Value); - else if (ID == "compute_pgm_rsrc1_priv") - Header.compute_pgm_resource_registers |= S_00B848_PRIV(Value); - else if (ID == "compute_pgm_rsrc1_dx10_clamp") - Header.compute_pgm_resource_registers |= S_00B848_DX10_CLAMP(Value); - else if (ID == "compute_pgm_rsrc1_debug_mode") - Header.compute_pgm_resource_registers |= S_00B848_DEBUG_MODE(Value); - else if (ID == "compute_pgm_rsrc1_ieee_mode") - Header.compute_pgm_resource_registers |= S_00B848_IEEE_MODE(Value); - else if (ID == "compute_pgm_rsrc2_scratch_en") - Header.compute_pgm_resource_registers |= (S_00B84C_SCRATCH_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_user_sgpr") - Header.compute_pgm_resource_registers |= (S_00B84C_USER_SGPR(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tgid_x_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_X_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tgid_y_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Y_EN(Value) << 32); - else if (ID == 
"compute_pgm_rsrc2_tgid_z_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TGID_Z_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tg_size_en") - Header.compute_pgm_resource_registers |= (S_00B84C_TG_SIZE_EN(Value) << 32); - else if (ID == "compute_pgm_rsrc2_tidig_comp_cnt") - Header.compute_pgm_resource_registers |= - (S_00B84C_TIDIG_COMP_CNT(Value) << 32); - else if (ID == "compute_pgm_rsrc2_excp_en_msb") - Header.compute_pgm_resource_registers |= - (S_00B84C_EXCP_EN_MSB(Value) << 32); - else if (ID == "compute_pgm_rsrc2_lds_size") - Header.compute_pgm_resource_registers |= (S_00B84C_LDS_SIZE(Value) << 32); - else if (ID == "compute_pgm_rsrc2_excp_en") - Header.compute_pgm_resource_registers |= (S_00B84C_EXCP_EN(Value) << 32); - else if (ID == "compute_pgm_resource_registers") - Header.compute_pgm_resource_registers = Value; - else if (ID == "enable_sgpr_private_segment_buffer") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT); - else if (ID == "enable_sgpr_dispatch_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT); - else if (ID == "enable_sgpr_queue_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT); - else if (ID == "enable_sgpr_kernarg_segment_ptr") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT); - else if (ID == "enable_sgpr_dispatch_id") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT); - else if (ID == "enable_sgpr_flat_scratch_init") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT); - else if (ID == "enable_sgpr_private_segment_size") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_x") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_y") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT); - else if (ID == "enable_sgpr_grid_workgroup_count_z") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT); - else if (ID == "enable_ordered_append_gds") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT); - else if (ID == "private_element_size") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT); - else if (ID == "is_ptr64") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_PTR64_SHIFT); - else if (ID == "is_dynamic_callstack") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT); - else if (ID == "is_debug_enabled") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT); - else if (ID == "is_xnack_enabled") - Header.code_properties |= - (Value << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT); - else if (ID == "workitem_private_segment_byte_size") - Header.workitem_private_segment_byte_size = Value; - else if (ID == "workgroup_group_segment_byte_size") - Header.workgroup_group_segment_byte_size = Value; - else if (ID == "gds_segment_byte_size") - Header.gds_segment_byte_size = Value; - else if (ID == "kernarg_segment_byte_size") - Header.kernarg_segment_byte_size = Value; - else if (ID == "workgroup_fbarrier_count") - Header.workgroup_fbarrier_count = Value; - else if (ID 
== "wavefront_sgpr_count") - Header.wavefront_sgpr_count = Value; - else if (ID == "workitem_vgpr_count") - Header.workitem_vgpr_count = Value; - else if (ID == "reserved_vgpr_first") - Header.reserved_vgpr_first = Value; - else if (ID == "reserved_vgpr_count") - Header.reserved_vgpr_count = Value; - else if (ID == "reserved_sgpr_first") - Header.reserved_sgpr_first = Value; - else if (ID == "reserved_sgpr_count") - Header.reserved_sgpr_count = Value; - else if (ID == "debug_wavefront_private_segment_offset_sgpr") - Header.debug_wavefront_private_segment_offset_sgpr = Value; - else if (ID == "debug_private_segment_buffer_sgpr") - Header.debug_private_segment_buffer_sgpr = Value; - else if (ID == "kernarg_segment_alignment") - Header.kernarg_segment_alignment = Value; - else if (ID == "group_segment_alignment") - Header.group_segment_alignment = Value; - else if (ID == "private_segment_alignment") - Header.private_segment_alignment = Value; - else if (ID == "wavefront_size") - Header.wavefront_size = Value; - else if (ID == "call_convention") - Header.call_convention = Value; - else if (ID == "runtime_loader_kernel_symbol") - Header.runtime_loader_kernel_symbol = Value; - else - return TokError("amd_kernel_code_t value not recognized."); - return false; } @@ -930,9 +1288,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { while (true) { - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("amd_kernel_code_t values must begin on a new line"); - // Lex EndOfStatement. This is in a while loop, because lexing a comment // will set the current token to EndOfStatement. while(getLexer().is(AsmToken::EndOfStatement)) @@ -1026,7 +1381,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); - if (IDVal == ".hsatext" || IDVal == ".text") + if (IDVal == ".hsatext") return ParseSectionDirectiveHSAText(); if (IDVal == ".amdgpu_hsa_kernel") @@ -1078,19 +1433,6 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, return true; } -static bool operandsHaveModifiers(const OperandVector &Operands) { - - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { - const AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isRegKind() && Op.hasModifiers()) - return true; - if (Op.isImm() && (Op.getImmTy() == AMDGPUOperand::ImmTyOMod || - Op.getImmTy() == AMDGPUOperand::ImmTyClamp)) - return true; - } - return false; -} - AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { @@ -1107,113 +1449,59 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { getLexer().is(AsmToken::EndOfStatement)) return ResTy; - bool Negate = false, Abs = false; - if (getLexer().getKind()== AsmToken::Minus) { - Parser.Lex(); - Negate = true; - } + ResTy = parseRegOrImm(Operands); - if (getLexer().getKind() == AsmToken::Pipe) { - Parser.Lex(); - Abs = true; - } - - switch(getLexer().getKind()) { - case AsmToken::Integer: { - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; - if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { - Error(S, "invalid immediate: only 32-bit values are legal"); - return MatchOperand_ParseFail; - } - - if (Negate) - IntVal *= -1; - Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); - return MatchOperand_Success; - } - case AsmToken::Real: { - // FIXME: We should emit an error if a double precisions 
floating-point - // value is used. I'm not sure the best way to detect this. - SMLoc S = Parser.getTok().getLoc(); - int64_t IntVal; - if (getParser().parseAbsoluteExpression(IntVal)) - return MatchOperand_ParseFail; + if (ResTy == MatchOperand_Success) + return ResTy; - APFloat F((float)BitsToDouble(IntVal)); - if (Negate) - F.changeSign(); - Operands.push_back( - AMDGPUOperand::CreateImm(F.bitcastToAPInt().getZExtValue(), S)); + if (getLexer().getKind() == AsmToken::Identifier) { + // If this identifier is a symbol, we want to create an expression for it. + // It is a little difficult to distinguish between a symbol name, and + // an instruction flag like 'gds'. In order to do this, we parse + // all tokens as expressions and then treate the symbol name as the token + // string when we want to interpret the operand as a token. + const auto &Tok = Parser.getTok(); + SMLoc S = Tok.getLoc(); + const MCExpr *Expr = nullptr; + if (!Parser.parseExpression(Expr)) { + Operands.push_back(AMDGPUOperand::CreateExpr(Expr, S)); return MatchOperand_Success; } - case AsmToken::Identifier: { - SMLoc S, E; - unsigned RegNo; - if (!ParseRegister(RegNo, S, E)) { - - bool HasModifiers = operandsHaveModifiers(Operands); - unsigned Modifiers = 0; - if (Negate) - Modifiers |= 0x1; - - if (Abs) { - if (getLexer().getKind() != AsmToken::Pipe) - return MatchOperand_ParseFail; - Parser.Lex(); - Modifiers |= 0x2; - } - - if (Modifiers && !HasModifiers) { - // We are adding a modifier to src1 or src2 and previous sources - // don't have modifiers, so we need to go back and empty modifers - // for each previous source. - for (unsigned PrevRegIdx = Operands.size() - 1; PrevRegIdx > 1; - --PrevRegIdx) { - - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[PrevRegIdx]); - RegOp.setModifiers(0); - } - } - - - Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), - isForcedVOP3())); - - if (HasModifiers || Modifiers) { - AMDGPUOperand &RegOp = ((AMDGPUOperand&)*Operands[Operands.size() - 1]); - RegOp.setModifiers(Modifiers); - - } - } else { - Operands.push_back(AMDGPUOperand::CreateToken(Parser.getTok().getString(), - S)); - Parser.Lex(); - } - return MatchOperand_Success; - } - default: - return MatchOperand_NoMatch; + Operands.push_back(AMDGPUOperand::CreateToken(Tok.getString(), Tok.getLoc())); + Parser.Lex(); + return MatchOperand_Success; } + return MatchOperand_NoMatch; } -bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, - SMLoc NameLoc, OperandVector &Operands) { - +StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) { // Clear any forced encodings from the previous instruction. 
setForcedEncodingSize(0); + setForcedDPP(false); + setForcedSDWA(false); - if (Name.endswith("_e64")) + if (Name.endswith("_e64")) { setForcedEncodingSize(64); - else if (Name.endswith("_e32")) + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_e32")) { setForcedEncodingSize(32); + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_dpp")) { + setForcedDPP(true); + return Name.substr(0, Name.size() - 4); + } else if (Name.endswith("_sdwa")) { + setForcedSDWA(true); + return Name.substr(0, Name.size() - 5); + } + return Name; +} +bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + SMLoc NameLoc, OperandVector &Operands) { // Add the instruction mnemonic + Name = parseMnemonicSuffix(Name); Operands.push_back(AMDGPUOperand::CreateToken(Name, NameLoc)); while (!getLexer().is(AsmToken::EndOfStatement)) { @@ -1225,20 +1513,21 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, switch (Res) { case MatchOperand_Success: break; - case MatchOperand_ParseFail: return Error(getLexer().getLoc(), - "failed parsing operand."); - case MatchOperand_NoMatch: return Error(getLexer().getLoc(), - "not a valid operand."); + case MatchOperand_ParseFail: + Error(getLexer().getLoc(), "failed parsing operand."); + while (!getLexer().is(AsmToken::EndOfStatement)) { + Parser.Lex(); + } + return true; + case MatchOperand_NoMatch: + Error(getLexer().getLoc(), "not a valid operand."); + while (!getLexer().is(AsmToken::EndOfStatement)) { + Parser.Lex(); + } + return true; } } - // Once we reach end of statement, continue parsing so we can add default - // values for optional arguments. - AMDGPUAsmParser::OperandMatchResultTy Res; - while ((Res = parseOperand(Operands, Name)) != MatchOperand_NoMatch) { - if (Res != MatchOperand_Success) - return Error(getLexer().getLoc(), "failed parsing operand."); - } return false; } @@ -1247,22 +1536,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info, //===----------------------------------------------------------------------===// AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, - int64_t Default) { - - // We are at the end of the statement, and this is a default argument, so - // use a default value. 
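The new parseMnemonicSuffix logic above boils down to peeling a known encoding suffix off the written mnemonic and remembering which encoding it forces. A minimal standalone sketch of that idea, using plain std::string instead of StringRef; the names here are illustrative only, not LLVM API:

    #include <cstddef>
    #include <string>
    #include <utility>

    enum class ForcedEncoding { None, E32, E64, DPP, SDWA };

    // Returns the bare mnemonic plus the encoding forced by its suffix.
    static std::pair<std::string, ForcedEncoding>
    stripMnemonicSuffix(const std::string &Name) {
      auto endsWith = [&](const char *S, std::size_t N) {
        return Name.size() >= N && Name.compare(Name.size() - N, N, S) == 0;
      };
      if (endsWith("_e64", 4))  return {Name.substr(0, Name.size() - 4), ForcedEncoding::E64};
      if (endsWith("_e32", 4))  return {Name.substr(0, Name.size() - 4), ForcedEncoding::E32};
      if (endsWith("_dpp", 4))  return {Name.substr(0, Name.size() - 4), ForcedEncoding::DPP};
      if (endsWith("_sdwa", 5)) return {Name.substr(0, Name.size() - 5), ForcedEncoding::SDWA};
      return {Name, ForcedEncoding::None};
    }

    int main() {
      return stripMnemonicSuffix("v_add_f32_e64").second == ForcedEncoding::E64 ? 0 : 1;
    }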
- if (getLexer().is(AsmToken::EndOfStatement)) { - Int = Default; - return MatchOperand_Success; - } - +AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) { switch(getLexer().getKind()) { default: return MatchOperand_NoMatch; case AsmToken::Identifier: { - StringRef OffsetName = Parser.getTok().getString(); - if (!OffsetName.equals(Prefix)) + StringRef Name = Parser.getTok().getString(); + if (!Name.equals(Prefix)) { return MatchOperand_NoMatch; + } Parser.Lex(); if (getLexer().isNot(AsmToken::Colon)) @@ -1282,16 +1563,21 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int, AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands, - enum AMDGPUOperand::ImmTy ImmTy) { + enum AMDGPUOperand::ImmTy ImmTy, + bool (*ConvertResult)(int64_t&)) { SMLoc S = Parser.getTok().getLoc(); - int64_t Offset = 0; + int64_t Value = 0; - AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Offset); + AMDGPUAsmParser::OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value); if (Res != MatchOperand_Success) return Res; - Operands.push_back(AMDGPUOperand::CreateImm(Offset, S, ImmTy)); + if (ConvertResult && !ConvertResult(Value)) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Value, S, ImmTy)); return MatchOperand_Success; } @@ -1327,101 +1613,52 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands, return MatchOperand_Success; } -static bool operandsHasOptionalOp(const OperandVector &Operands, - const OptionalOperand &OOp) { - for (unsigned i = 0; i < Operands.size(); i++) { - const AMDGPUOperand &ParsedOp = ((const AMDGPUOperand &)*Operands[i]); - if ((ParsedOp.isImm() && ParsedOp.getImmTy() == OOp.Type) || - (ParsedOp.isToken() && ParsedOp.getToken() == OOp.Name)) - return true; +typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap; +void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands, + OptionalImmIndexMap& OptionalIdx, + enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) { + auto i = OptionalIdx.find(ImmT); + if (i != OptionalIdx.end()) { + unsigned Idx = i->second; + ((AMDGPUOperand &)*Operands[Idx]).addImmOperands(Inst, 1); + } else { + Inst.addOperand(MCOperand::createImm(Default)); } - return false; } AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOptionalOps(const ArrayRef<OptionalOperand> &OptionalOps, - OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - for (const OptionalOperand &Op : OptionalOps) { - if (operandsHasOptionalOp(Operands, Op)) - continue; - AMDGPUAsmParser::OperandMatchResultTy Res; - int64_t Value; - if (Op.IsBit) { - Res = parseNamedBit(Op.Name, Operands, Op.Type); - if (Res == MatchOperand_NoMatch) - continue; - return Res; - } - - Res = parseIntWithPrefix(Op.Name, Value, Op.Default); - - if (Res == MatchOperand_NoMatch) - continue; - - if (Res != MatchOperand_Success) - return Res; +AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix, StringRef &Value) { + if (getLexer().isNot(AsmToken::Identifier)) { + return MatchOperand_NoMatch; + } + StringRef Tok = Parser.getTok().getString(); + if (Tok != Prefix) { + return MatchOperand_NoMatch; + } - if (Op.ConvertResult && !Op.ConvertResult(Value)) { - return MatchOperand_ParseFail; - } + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) { + return MatchOperand_ParseFail; + } - Operands.push_back(AMDGPUOperand::CreateImm(Value, S, Op.Type)); - return MatchOperand_Success; + 
Parser.Lex(); + if (getLexer().isNot(AsmToken::Identifier)) { + return MatchOperand_ParseFail; } - return MatchOperand_NoMatch; + + Value = Parser.getTok().getString(); + return MatchOperand_Success; } //===----------------------------------------------------------------------===// // ds //===----------------------------------------------------------------------===// -static const OptionalOperand DSOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -static const OptionalOperand DSOptionalOpsOff01 [] = { - {"offset0", AMDGPUOperand::ImmTyDSOffset0, false, 0, nullptr}, - {"offset1", AMDGPUOperand::ImmTyDSOffset1, false, 0, nullptr}, - {"gds", AMDGPUOperand::ImmTyGDS, true, 0, nullptr} -}; - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOps, Operands); -} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOff01OptionalOps(OperandVector &Operands) { - return parseOptionalOps(DSOptionalOpsOff01, Operands); -} - -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDSOffsetOptional(OperandVector &Operands) { - SMLoc S = Parser.getTok().getLoc(); - AMDGPUAsmParser::OperandMatchResultTy Res = - parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset); - if (Res == MatchOperand_NoMatch) { - Operands.push_back(AMDGPUOperand::CreateImm(0, S, - AMDGPUOperand::ImmTyOffset)); - Res = MatchOperand_Success; - } - return Res; -} - -bool AMDGPUOperand::isDSOffset() const { - return isImm() && isUInt<16>(getImm()); -} - -bool AMDGPUOperand::isDSOffset01() const { - return isImm() && isUInt<8>(getImm()); -} - void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + OptionalImmIndexMap OptionalIdx; for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -1436,13 +1673,10 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst, OptionalIdx[Op.getImmTy()] = i; } - unsigned Offset0Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset0]; - unsigned Offset1Idx = OptionalIdx[AMDGPUOperand::ImmTyDSOffset1]; - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); - ((AMDGPUOperand &)*Operands[Offset0Idx]).addImmOperands(Inst, 1); // offset0 - ((AMDGPUOperand &)*Operands[Offset1Idx]).addImmOperands(Inst, 1); // offset1 - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } @@ -1469,12 +1703,11 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) { OptionalIdx[Op.getImmTy()] = i; } - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); // offset + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); if (!GDSOnly) { - unsigned GDSIdx = OptionalIdx[AMDGPUOperand::ImmTyGDS]; - ((AMDGPUOperand &)*Operands[GDSIdx]).addImmOperands(Inst, 1); // gds + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS); } 
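The OptionalImmIndexMap plus addOptionalImmOperand pattern used above, and throughout the rewritten converters below, replaces per-instruction index bookkeeping: operands the user actually wrote are copied into the instruction, and anything omitted falls back to a default immediate. A simplified self-contained sketch of the same idea, with plain containers standing in for MCInst and AMDGPUOperand:

    #include <cstdint>
    #include <map>
    #include <vector>

    enum class ImmTy { Offset0, Offset1, GDS, GLC, SLC, TFE };

    // Parsed immediates, in the order they appeared on the source line.
    using ParsedImms = std::vector<int64_t>;
    // Maps an operand kind to its index in ParsedImms, if it was written at all.
    using OptionalIdxMap = std::map<ImmTy, unsigned>;

    static void addOptionalImm(std::vector<int64_t> &InstOperands,
                               const ParsedImms &Parsed,
                               const OptionalIdxMap &OptionalIdx,
                               ImmTy Ty, int64_t Default = 0) {
      auto It = OptionalIdx.find(Ty);
      InstOperands.push_back(It == OptionalIdx.end() ? Default : Parsed[It->second]);
    }

    int main() {
      ParsedImms Parsed = {16};                      // e.g. the user wrote "offset0:16"
      OptionalIdxMap Idx = {{ImmTy::Offset0, 0}};
      std::vector<int64_t> Ops;
      addOptionalImm(Ops, Parsed, Idx, ImmTy::Offset0); // 16, taken from the source
      addOptionalImm(Ops, Parsed, Idx, ImmTy::Offset1); // 0, defaulted
      addOptionalImm(Ops, Parsed, Idx, ImmTy::GDS);     // 0, defaulted
      return Ops[0] == 16 && Ops[1] == 0 ? 0 : 1;
    }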
Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0 } @@ -1516,7 +1749,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) { CntMask = 0x7; CntShift = 4; } else if (CntName == "lgkmcnt") { - CntMask = 0x7; + CntMask = 0xf; CntShift = 8; } else { return true; @@ -1532,8 +1765,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { // Disable all counters by default. // vmcnt [3:0] // expcnt [6:4] - // lgkmcnt [10:8] - int64_t CntVal = 0x77f; + // lgkmcnt [11:8] + int64_t CntVal = 0xf7f; SMLoc S = Parser.getTok().getLoc(); switch(getLexer().getKind()) { @@ -1555,10 +1788,298 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) { return MatchOperand_Success; } +bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) { + using namespace llvm::AMDGPU::Hwreg; + + if (Parser.getTok().getString() != "hwreg") + return true; + Parser.Lex(); + + if (getLexer().isNot(AsmToken::LParen)) + return true; + Parser.Lex(); + + if (getLexer().is(AsmToken::Identifier)) { + HwReg.IsSymbolic = true; + HwReg.Id = ID_UNKNOWN_; + const StringRef tok = Parser.getTok().getString(); + for (int i = ID_SYMBOLIC_FIRST_; i < ID_SYMBOLIC_LAST_; ++i) { + if (tok == IdSymbolic[i]) { + HwReg.Id = i; + break; + } + } + Parser.Lex(); + } else { + HwReg.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(HwReg.Id)) + return true; + } + + if (getLexer().is(AsmToken::RParen)) { + Parser.Lex(); + return false; + } + + // optional params + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Offset)) + return true; + + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Width)) + return true; + + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { + using namespace llvm::AMDGPU::Hwreg; + + int64_t Imm16Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: return MatchOperand_NoMatch; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(Imm16Val)) + return MatchOperand_NoMatch; + if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { + Error(S, "invalid immediate: only 16-bit values are legal"); + // Do not return error code, but create an imm operand anyway and proceed + // to the next operand, if any. That avoids unneccessary error messages. 
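The hunk above widens lgkmcnt from three to four bits, which is why the "nothing left to wait for" default changes from 0x77f to 0xf7f. A small self-contained sketch of how an s_waitcnt immediate packs under the field layout given in the comments (vmcnt [3:0], expcnt [6:4], lgkmcnt [11:8]); the helper name is made up for illustration:

    #include <cassert>
    #include <cstdint>

    static uint32_t encodeWaitcnt(uint32_t Vmcnt, uint32_t Expcnt, uint32_t Lgkmcnt) {
      uint32_t Imm = 0xf7f;                                // all counters disabled
      Imm = (Imm & ~(0xfu << 0)) | ((Vmcnt  & 0xf) << 0);  // vmcnt  in bits [3:0]
      Imm = (Imm & ~(0x7u << 4)) | ((Expcnt & 0x7) << 4);  // expcnt in bits [6:4]
      Imm = (Imm & ~(0xfu << 8)) | ((Lgkmcnt & 0xf) << 8); // lgkmcnt in bits [11:8]
      return Imm;
    }

    int main() {
      assert(encodeWaitcnt(0xf, 0x7, 0xf) == 0xf7f); // nothing to wait for
      assert(encodeWaitcnt(0x0, 0x7, 0xf) == 0xf70); // wait for all vm ops
      return 0;
    }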
+ } + break; + + case AsmToken::Identifier: { + OperandInfoTy HwReg(ID_UNKNOWN_); + int64_t Offset = OFFSET_DEFAULT_; + int64_t Width = WIDTH_M1_DEFAULT_ + 1; + if (parseHwregConstruct(HwReg, Offset, Width)) + return MatchOperand_ParseFail; + if (HwReg.Id < 0 || !isUInt<ID_WIDTH_>(HwReg.Id)) { + if (HwReg.IsSymbolic) + Error(S, "invalid symbolic name of hardware register"); + else + Error(S, "invalid code of hardware register: only 6-bit values are legal"); + } + if (Offset < 0 || !isUInt<OFFSET_WIDTH_>(Offset)) + Error(S, "invalid bit offset: only 5-bit values are legal"); + if ((Width-1) < 0 || !isUInt<WIDTH_M1_WIDTH_>(Width-1)) + Error(S, "invalid bitfield width: only values from 1 to 32 are legal"); + Imm16Val = (HwReg.Id << ID_SHIFT_) | (Offset << OFFSET_SHIFT_) | ((Width-1) << WIDTH_M1_SHIFT_); + } + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTyHwreg)); + return MatchOperand_Success; +} + bool AMDGPUOperand::isSWaitCnt() const { return isImm(); } +bool AMDGPUOperand::isHwreg() const { + return isImmTy(ImmTyHwreg); +} + +bool AMDGPUAsmParser::parseSendMsgConstruct(OperandInfoTy &Msg, OperandInfoTy &Operation, int64_t &StreamId) { + using namespace llvm::AMDGPU::SendMsg; + + if (Parser.getTok().getString() != "sendmsg") + return true; + Parser.Lex(); + + if (getLexer().isNot(AsmToken::LParen)) + return true; + Parser.Lex(); + + if (getLexer().is(AsmToken::Identifier)) { + Msg.IsSymbolic = true; + Msg.Id = ID_UNKNOWN_; + const std::string tok = Parser.getTok().getString(); + for (int i = ID_GAPS_FIRST_; i < ID_GAPS_LAST_; ++i) { + switch(i) { + default: continue; // Omit gaps. + case ID_INTERRUPT: case ID_GS: case ID_GS_DONE: case ID_SYSMSG: break; + } + if (tok == IdSymbolic[i]) { + Msg.Id = i; + break; + } + } + Parser.Lex(); + } else { + Msg.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Msg.Id)) + return true; + if (getLexer().is(AsmToken::Integer)) + if (getParser().parseAbsoluteExpression(Msg.Id)) + Msg.Id = ID_UNKNOWN_; + } + if (Msg.Id == ID_UNKNOWN_) // Don't know how to parse the rest. + return false; + + if (!(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG)) { + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + return false; + } + + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); + + assert(Msg.Id == ID_GS || Msg.Id == ID_GS_DONE || Msg.Id == ID_SYSMSG); + Operation.Id = ID_UNKNOWN_; + if (getLexer().is(AsmToken::Identifier)) { + Operation.IsSymbolic = true; + const char* const *S = (Msg.Id == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic; + const int F = (Msg.Id == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_; + const int L = (Msg.Id == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_; + const StringRef Tok = Parser.getTok().getString(); + for (int i = F; i < L; ++i) { + if (Tok == S[i]) { + Operation.Id = i; + break; + } + } + Parser.Lex(); + } else { + Operation.IsSymbolic = false; + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(Operation.Id)) + return true; + } + + if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { + // Stream id is optional. 
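The hwreg(...) operand parsed a few lines above is folded into a single 16-bit immediate as id | (offset << OFFSET_SHIFT_) | ((width - 1) << WIDTH_M1_SHIFT_). The sketch below assumes the field split implied by the range checks (6-bit id, 5-bit offset, 5-bit width-minus-one) and uses shifts of 0, 6 and 11 purely as stand-ins for the real *_SHIFT_ constants:

    #include <cassert>
    #include <cstdint>

    static uint16_t encodeHwreg(unsigned Id, unsigned Offset, unsigned Width) {
      assert(Id < 64 && Offset < 32 && Width >= 1 && Width <= 32);
      return static_cast<uint16_t>((Id << 0) | (Offset << 6) | ((Width - 1) << 11));
    }

    int main() {
      // hwreg(3, 0, 32), i.e. all 32 bits of register id 3 (ids chosen arbitrarily).
      assert(encodeHwreg(3, 0, 32) == ((31u << 11) | 3u));
      return 0;
    }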
+ if (getLexer().is(AsmToken::RParen)) { + Parser.Lex(); + return false; + } + + if (getLexer().isNot(AsmToken::Comma)) + return true; + Parser.Lex(); + + if (getLexer().isNot(AsmToken::Integer)) + return true; + if (getParser().parseAbsoluteExpression(StreamId)) + return true; + } + + if (getLexer().isNot(AsmToken::RParen)) + return true; + Parser.Lex(); + return false; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) { + using namespace llvm::AMDGPU::SendMsg; + + int64_t Imm16Val = 0; + SMLoc S = Parser.getTok().getLoc(); + + switch(getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::Integer: + // The operand can be an integer value. + if (getParser().parseAbsoluteExpression(Imm16Val)) + return MatchOperand_NoMatch; + if (Imm16Val < 0 || !isUInt<16>(Imm16Val)) { + Error(S, "invalid immediate: only 16-bit values are legal"); + // Do not return error code, but create an imm operand anyway and proceed + // to the next operand, if any. That avoids unneccessary error messages. + } + break; + case AsmToken::Identifier: { + OperandInfoTy Msg(ID_UNKNOWN_); + OperandInfoTy Operation(OP_UNKNOWN_); + int64_t StreamId = STREAM_ID_DEFAULT_; + if (parseSendMsgConstruct(Msg, Operation, StreamId)) + return MatchOperand_ParseFail; + do { + // Validate and encode message ID. + if (! ((ID_INTERRUPT <= Msg.Id && Msg.Id <= ID_GS_DONE) + || Msg.Id == ID_SYSMSG)) { + if (Msg.IsSymbolic) + Error(S, "invalid/unsupported symbolic name of message"); + else + Error(S, "invalid/unsupported code of message"); + break; + } + Imm16Val = (Msg.Id << ID_SHIFT_); + // Validate and encode operation ID. + if (Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) { + if (! (OP_GS_FIRST_ <= Operation.Id && Operation.Id < OP_GS_LAST_)) { + if (Operation.IsSymbolic) + Error(S, "invalid symbolic name of GS_OP"); + else + Error(S, "invalid code of GS_OP: only 2-bit values are legal"); + break; + } + if (Operation.Id == OP_GS_NOP + && Msg.Id != ID_GS_DONE) { + Error(S, "invalid GS_OP: NOP is for GS_DONE only"); + break; + } + Imm16Val |= (Operation.Id << OP_SHIFT_); + } + if (Msg.Id == ID_SYSMSG) { + if (! (OP_SYS_FIRST_ <= Operation.Id && Operation.Id < OP_SYS_LAST_)) { + if (Operation.IsSymbolic) + Error(S, "invalid/unsupported symbolic name of SYSMSG_OP"); + else + Error(S, "invalid/unsupported code of SYSMSG_OP"); + break; + } + Imm16Val |= (Operation.Id << OP_SHIFT_); + } + // Validate and encode stream ID. + if ((Msg.Id == ID_GS || Msg.Id == ID_GS_DONE) && Operation.Id != OP_GS_NOP) { + if (! 
(STREAM_ID_FIRST_ <= StreamId && StreamId < STREAM_ID_LAST_)) { + Error(S, "invalid stream id: only 2-bit values are legal"); + break; + } + Imm16Val |= (StreamId << STREAM_ID_SHIFT_); + } + } while (0); + } + break; + } + Operands.push_back(AMDGPUOperand::CreateImm(Imm16Val, S, AMDGPUOperand::ImmTySendMsg)); + return MatchOperand_Success; +} + +bool AMDGPUOperand::isSendMsg() const { + return isImmTy(ImmTySendMsg); +} + //===----------------------------------------------------------------------===// // sopp branch targets //===----------------------------------------------------------------------===// @@ -1587,33 +2108,26 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) { } //===----------------------------------------------------------------------===// -// flat +// mubuf //===----------------------------------------------------------------------===// -static const OptionalOperand FlatOptionalOps [] = { - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; - -static const OptionalOperand FlatAtomicOptionalOps [] = { - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultGLC() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyGLC); +} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatOptionalOps, Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSLC() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTySLC); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseFlatAtomicOptionalOps(OperandVector &Operands) { - return parseOptionalOps(FlatAtomicOptionalOps, Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultTFE() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyTFE); } -void AMDGPUAsmParser::cvtFlat(MCInst &Inst, - const OperandVector &Operands) { - std::map<AMDGPUOperand::ImmTy, unsigned> OptionalIdx; +void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, + const OperandVector &Operands, + bool IsAtomic, bool IsAtomicReturn) { + OptionalImmIndexMap OptionalIdx; + assert(IsAtomicReturn ? IsAtomic : true); for (unsigned i = 1, e = Operands.size(); i != e; ++i) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); @@ -1624,129 +2138,128 @@ void AMDGPUAsmParser::cvtFlat(MCInst &Inst, continue; } - // Handle 'glc' token which is sometimes hard-coded into the + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the // asm string. There are no MCInst operands for these. - if (Op.isToken()) + if (Op.isToken()) { continue; + } + assert(Op.isImm()); // Handle optional arguments OptionalIdx[Op.getImmTy()] = i; - } - // flat atomic instructions don't have a glc argument. - if (OptionalIdx.count(AMDGPUOperand::ImmTyGLC)) { - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); + // Copy $vdata_in operand and insert as $vdata for MUBUF_Atomic RTN insns. + if (IsAtomicReturn) { + MCInst::iterator I = Inst.begin(); // $vdata_in is always at the beginning. 
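Stepping back to the sendmsg(...) operand handled just above: it is likewise collapsed into one immediate by or-ing the message id, the operation id and the stream id at fixed shifts. The sketch below uses shifts of 0, 4 and 8 only as placeholders for ID_SHIFT_, OP_SHIFT_ and STREAM_ID_SHIFT_; just the overall shape is taken from the parser:

    #include <cstdint>

    static uint16_t encodeSendMsg(unsigned MsgId, unsigned OpId, unsigned StreamId) {
      uint16_t Imm = static_cast<uint16_t>(MsgId);   // message id
      Imm |= static_cast<uint16_t>(OpId << 4);       // GS_OP / SYSMSG_OP
      Imm |= static_cast<uint16_t>(StreamId << 8);   // stream id (GS messages only)
      return Imm;
    }

    int main() {
      // Arbitrary ids, just exercising the packing, not real message codes.
      return encodeSendMsg(2, 1, 0) == ((1u << 4) | 2u) ? 0 : 1;
    }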
+ Inst.insert(I, *I); } - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; - - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + if (!IsAtomic) { // glc is hard-coded. + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + } + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } //===----------------------------------------------------------------------===// -// mubuf +// mimg //===----------------------------------------------------------------------===// -static const OptionalOperand MubufOptionalOps [] = { - {"offset", AMDGPUOperand::ImmTyOffset, false, 0, nullptr}, - {"glc", AMDGPUOperand::ImmTyGLC, true, 0, nullptr}, - {"slc", AMDGPUOperand::ImmTySLC, true, 0, nullptr}, - {"tfe", AMDGPUOperand::ImmTyTFE, true, 0, nullptr} -}; +void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseMubufOptionalOps(OperandVector &Operands) { - return parseOptionalOps(MubufOptionalOps, Operands); -} + OptionalImmIndexMap OptionalIdx; -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseOffset(OperandVector &Operands) { - return parseIntWithPrefix("offset", Operands); -} + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseGLC(OperandVector &Operands) { - return parseNamedBit("glc", Operands); -} + // Add the register arguments + if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); + continue; + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); + } + } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseSLC(OperandVector &Operands) { - return parseNamedBit("slc", Operands); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseTFE(OperandVector &Operands) { - return parseNamedBit("tfe", Operands); -} +void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } -bool AMDGPUOperand::isMubufOffset() const { - return isImm() && isUInt<12>(getImm()); -} + // Add src, same as dst + ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); -void 
AMDGPUAsmParser::cvtMubuf(MCInst &Inst, - const OperandVector &Operands) { - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + OptionalImmIndexMap OptionalIdx; - for (unsigned i = 1, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); // Add the register arguments - if (Op.isReg()) { - Op.addRegOperands(Inst, 1); - continue; - } - - // Handle the case where soffset is an immediate - if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { - Op.addImmOperands(Inst, 1); - continue; - } - - // Handle tokens like 'offen' which are sometimes hard-coded into the - // asm string. There are no MCInst operands for these. - if (Op.isToken()) { + if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); continue; + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); } - assert(Op.isImm()); - - // Handle optional arguments - OptionalIdx[Op.getImmTy()] = i; } - assert(OptionalIdx.size() == 4); - - unsigned OffsetIdx = OptionalIdx[AMDGPUOperand::ImmTyOffset]; - unsigned GLCIdx = OptionalIdx[AMDGPUOperand::ImmTyGLC]; - unsigned SLCIdx = OptionalIdx[AMDGPUOperand::ImmTySLC]; - unsigned TFEIdx = OptionalIdx[AMDGPUOperand::ImmTyTFE]; + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); +} - ((AMDGPUOperand &)*Operands[OffsetIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[GLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[SLCIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[TFEIdx]).addImmOperands(Inst, 1); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDMask); } -//===----------------------------------------------------------------------===// -// mimg -//===----------------------------------------------------------------------===// +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultUNorm() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyUNorm); +} -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseDMask(OperandVector &Operands) { - return parseIntWithPrefix("dmask", Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDA() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDA); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseUNorm(OperandVector &Operands) { - return parseNamedBit("unorm", Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultR128() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyR128); } -AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseR128(OperandVector &Operands) { - return parseNamedBit("r128", Operands); +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultLWE() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyLWE); } 
//===----------------------------------------------------------------------===// @@ -1766,6 +2279,14 @@ bool AMDGPUOperand::isSMRDLiteralOffset() const { return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); } +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyOffset); +} + //===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -1792,91 +2313,435 @@ static bool ConvertOmodDiv(int64_t &Div) { return false; } -static const OptionalOperand VOP3OptionalOps [] = { - {"clamp", AMDGPUOperand::ImmTyClamp, true, 0, nullptr}, - {"mul", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodMul}, - {"div", AMDGPUOperand::ImmTyOMod, false, 1, ConvertOmodDiv}, +static bool ConvertBoundCtrl(int64_t &BoundCtrl) { + if (BoundCtrl == 0) { + BoundCtrl = 1; + return true; + } else if (BoundCtrl == -1) { + BoundCtrl = 0; + return true; + } + return false; +} + +// Note: the order in this table matches the order of operands in AsmString. +static const OptionalOperand AMDGPUOptionalOperandTable[] = { + {"offen", AMDGPUOperand::ImmTyOffen, true, nullptr}, + {"idxen", AMDGPUOperand::ImmTyIdxen, true, nullptr}, + {"addr64", AMDGPUOperand::ImmTyAddr64, true, nullptr}, + {"offset0", AMDGPUOperand::ImmTyOffset0, false, nullptr}, + {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, + {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, + {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, + {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, + {"clamp", AMDGPUOperand::ImmTyClampSI, true, nullptr}, + {"omod", AMDGPUOperand::ImmTyOModSI, false, ConvertOmodMul}, + {"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr}, + {"da", AMDGPUOperand::ImmTyDA, true, nullptr}, + {"r128", AMDGPUOperand::ImmTyR128, true, nullptr}, + {"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr}, + {"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr}, + {"row_mask", AMDGPUOperand::ImmTyDppRowMask, false, nullptr}, + {"bank_mask", AMDGPUOperand::ImmTyDppBankMask, false, nullptr}, + {"bound_ctrl", AMDGPUOperand::ImmTyDppBoundCtrl, false, ConvertBoundCtrl}, + {"dst_sel", AMDGPUOperand::ImmTySdwaDstSel, false, nullptr}, + {"src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel, false, nullptr}, + {"src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr}, + {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr}, }; -static bool isVOP3(OperandVector &Operands) { - if (operandsHaveModifiers(Operands)) - return true; +AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) { + OperandMatchResultTy res; + for (const OptionalOperand &Op : AMDGPUOptionalOperandTable) { + // try to parse any optional operand here + if (Op.IsBit) { + res = parseNamedBit(Op.Name, Operands, Op.Type); + } else if (Op.Type == AMDGPUOperand::ImmTyOModSI) { + res = parseOModOperand(Operands); + } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstSel || + Op.Type == AMDGPUOperand::ImmTySdwaSrc0Sel || + Op.Type == AMDGPUOperand::ImmTySdwaSrc1Sel) { + res = parseSDWASel(Operands, Op.Name, Op.Type); + } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) { + res = parseSDWADstUnused(Operands); 
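AMDGPUOptionalOperandTable above drives one generic parseOptionalOperand loop in place of the old per-format parse*OptionalOps helpers: each row names the operand, says whether it is a bare bit or takes a name:value form, and may rewrite the parsed value (bound_ctrl:0 is stored as 1, as ConvertBoundCtrl shows). A trimmed-down sketch of such a table with a handful of illustrative rows:

    #include <cstdint>

    // One row per optional operand: spelled name, bare-bit flag, optional rewrite.
    struct OptionalOpRow {
      const char *Name;
      bool IsBit;
      bool (*Convert)(int64_t &);
    };

    static bool convertBoundCtrl(int64_t &V) {
      if (V == 0)  { V = 1; return true; }   // "bound_ctrl:0" is encoded as 1
      if (V == -1) { V = 0; return true; }
      return false;
    }

    static const OptionalOpRow Table[] = {
      {"gds",        true,  nullptr},
      {"offset",     false, nullptr},
      {"glc",        true,  nullptr},
      {"bound_ctrl", false, convertBoundCtrl},
    };

    int main() {
      int64_t V = 0;
      return Table[3].Convert && Table[3].Convert(V) && V == 1 ? 0 : 1;
    }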
+ } else { + res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult); + } + if (res != MatchOperand_NoMatch) { + return res; + } + } + return MatchOperand_NoMatch; +} - AMDGPUOperand &DstOp = ((AMDGPUOperand&)*Operands[1]); +AMDGPUAsmParser::OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) +{ + StringRef Name = Parser.getTok().getString(); + if (Name == "mul") { + return parseIntWithPrefix("mul", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodMul); + } else if (Name == "div") { + return parseIntWithPrefix("div", Operands, AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv); + } else { + return MatchOperand_NoMatch; + } +} - if (DstOp.isReg() && DstOp.isRegClass(AMDGPU::SGPR_64RegClassID)) - return true; +void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) { + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + for (unsigned E = Operands.size(); I != E; ++I) + ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1); +} - if (Operands.size() >= 5) - return true; +void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) { + uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; + if (TSFlags & SIInstrFlags::VOP3) { + cvtVOP3(Inst, Operands); + } else { + cvtId(Inst, Operands); + } +} - if (Operands.size() > 3) { - AMDGPUOperand &Src1Op = ((AMDGPUOperand&)*Operands[3]); - if (Src1Op.getReg() && (Src1Op.isRegClass(AMDGPU::SReg_32RegClassID) || - Src1Op.isRegClass(AMDGPU::SReg_64RegClassID))) - return true; +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isRegOrImmWithInputMods()) { + // only fp modifiers allowed in VOP3 + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImm()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + assert(false); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); +} + +//===----------------------------------------------------------------------===// +// dpp +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isDPPCtrl() const { + bool result = isImm() && getImmTy() == ImmTyDppCtrl && isUInt<9>(getImm()); + if (result) { + int64_t Imm = getImm(); + return ((Imm >= 0x000) && (Imm <= 0x0ff)) || + ((Imm >= 0x101) && (Imm <= 0x10f)) || + ((Imm >= 0x111) && (Imm <= 0x11f)) || + ((Imm >= 0x121) && (Imm <= 0x12f)) || + (Imm == 0x130) || + (Imm == 0x134) || + (Imm == 0x138) || + (Imm == 0x13c) || + (Imm == 0x140) || + (Imm == 0x141) || + (Imm == 0x142) || + (Imm == 0x143); } return false; } AMDGPUAsmParser::OperandMatchResultTy -AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { - - // The value returned by this function may change after parsing - // an operand so store the original value here. 
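The DPP control values accepted by isDPPCtrl above, and built by parseDPPCtrl in the next hunk, follow a simple scheme: quad_perm packs four 2-bit lane selects into the low byte, while the row/wave variants or a shift amount into a base above that range. A small sketch grounded in those ranges:

    #include <cassert>

    // quad_perm:[a,b,c,d] -> four 2-bit selects in bits [7:0].
    static unsigned encodeQuadPerm(unsigned A, unsigned B, unsigned C, unsigned D) {
      assert(A < 4 && B < 4 && C < 4 && D < 4);
      return A | (B << 2) | (C << 4) | (D << 6);
    }

    // row_shl:n / row_shr:n / row_ror:n or a shift count into a base value.
    static unsigned encodeRowShift(unsigned Base, unsigned N) {
      assert(N >= 1 && N <= 15);
      return Base | N;   // e.g. row_shl uses base 0x100, row_shr 0x110, row_ror 0x120
    }

    int main() {
      assert(encodeQuadPerm(0, 1, 2, 3) == 0xe4); // the identity permutation
      assert(encodeRowShift(0x100, 1) == 0x101);  // row_shl:1
      return 0;
    }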
- bool HasModifiers = operandsHaveModifiers(Operands); - - bool IsVOP3 = isVOP3(Operands); - if (HasModifiers || IsVOP3 || - getLexer().isNot(AsmToken::EndOfStatement) || - getForcedEncodingSize() == 64) { - - AMDGPUAsmParser::OperandMatchResultTy Res = - parseOptionalOps(VOP3OptionalOps, Operands); - - if (!HasModifiers && Res == MatchOperand_Success) { - // We have added a modifier operation, so we need to make sure all - // previous register operands have modifiers - for (unsigned i = 2, e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand&)*Operands[i]); - if (Op.isReg()) - Op.setModifiers(0); +AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Prefix; + int64_t Int; + + if (getLexer().getKind() == AsmToken::Identifier) { + Prefix = Parser.getTok().getString(); + } else { + return MatchOperand_NoMatch; + } + + if (Prefix == "row_mirror") { + Int = 0x140; + } else if (Prefix == "row_half_mirror") { + Int = 0x141; + } else { + // Check to prevent parseDPPCtrlOps from eating invalid tokens + if (Prefix != "quad_perm" + && Prefix != "row_shl" + && Prefix != "row_shr" + && Prefix != "row_ror" + && Prefix != "wave_shl" + && Prefix != "wave_rol" + && Prefix != "wave_shr" + && Prefix != "wave_ror" + && Prefix != "row_bcast") { + return MatchOperand_NoMatch; + } + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Colon)) + return MatchOperand_ParseFail; + + if (Prefix == "quad_perm") { + // quad_perm:[%d,%d,%d,%d] + Parser.Lex(); + if (getLexer().isNot(AsmToken::LBrac)) + return MatchOperand_ParseFail; + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int = getLexer().getTok().getIntVal(); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 2); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 4); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::Comma)) + return MatchOperand_ParseFail; + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int += (getLexer().getTok().getIntVal() << 6); + + Parser.Lex(); + if (getLexer().isNot(AsmToken::RBrac)) + return MatchOperand_ParseFail; + + } else { + // sel:%d + Parser.Lex(); + if (getLexer().isNot(AsmToken::Integer)) + return MatchOperand_ParseFail; + Int = getLexer().getTok().getIntVal(); + + if (Prefix == "row_shl") { + Int |= 0x100; + } else if (Prefix == "row_shr") { + Int |= 0x110; + } else if (Prefix == "row_ror") { + Int |= 0x120; + } else if (Prefix == "wave_shl") { + Int = 0x130; + } else if (Prefix == "wave_rol") { + Int = 0x134; + } else if (Prefix == "wave_shr") { + Int = 0x138; + } else if (Prefix == "wave_ror") { + Int = 0x13C; + } else if (Prefix == "row_bcast") { + if (Int == 15) { + Int = 0x142; + } else if (Int == 31) { + Int = 0x143; + } else { + return MatchOperand_ParseFail; + } + } else { + return MatchOperand_ParseFail; } } - return Res; } - return MatchOperand_NoMatch; + Parser.Lex(); // eat last token + + Operands.push_back(AMDGPUOperand::CreateImm(Int, S, + AMDGPUOperand::ImmTyDppCtrl)); + return MatchOperand_Success; } -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { +AMDGPUOperand::Ptr 
AMDGPUAsmParser::defaultRowMask() const { + return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const { + return AMDGPUOperand::CreateImm(0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask); +} + +AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBoundCtrl() const { + return AMDGPUOperand::CreateImm(0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl); +} + +void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; - unsigned i = 1; + unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - if (Desc.getNumDefs() > 0) { - ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + // Add the register arguments + if (Op.isRegOrImmWithInputMods()) { + // Only float modifiers supported in DPP + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isDPPCtrl()) { + Op.addImmOperands(Inst, 1); + } else if (Op.isImm()) { + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("Invalid operand type"); + } + } - if (operandsHaveModifiers(Operands)) { - for (unsigned e = Operands.size(); i != e; ++i) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl); +} - if (Op.isRegWithInputMods()) { - ((AMDGPUOperand &)*Operands[i]).addRegWithInputModsOperands(Inst, 2); - continue; - } - OptionalIdx[Op.getImmTy()] = i; - } +//===----------------------------------------------------------------------===// +// sdwa +//===----------------------------------------------------------------------===// - unsigned ClampIdx = OptionalIdx[AMDGPUOperand::ImmTyClamp]; - unsigned OModIdx = OptionalIdx[AMDGPUOperand::ImmTyOMod]; +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix, + AMDGPUOperand::ImmTy Type) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Value; + AMDGPUAsmParser::OperandMatchResultTy res; - ((AMDGPUOperand &)*Operands[ClampIdx]).addImmOperands(Inst, 1); - ((AMDGPUOperand &)*Operands[OModIdx]).addImmOperands(Inst, 1); - } else { - for (unsigned e = Operands.size(); i != e; ++i) - ((AMDGPUOperand &)*Operands[i]).addRegOrImmOperands(Inst, 1); + res = parseStringWithPrefix(Prefix, Value); + if (res != MatchOperand_Success) { + return res; + } + + int64_t Int; + Int = StringSwitch<int64_t>(Value) + .Case("BYTE_0", 0) + .Case("BYTE_1", 1) + .Case("BYTE_2", 2) + .Case("BYTE_3", 3) + .Case("WORD_0", 4) + .Case("WORD_1", 5) + .Case("DWORD", 6) + .Default(0xffffffff); + Parser.Lex(); // eat last token + + if (Int == 0xffffffff) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Int, S, Type)); + return MatchOperand_Success; +} + +AMDGPUAsmParser::OperandMatchResultTy +AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) { + SMLoc S = Parser.getTok().getLoc(); + StringRef Value; + AMDGPUAsmParser::OperandMatchResultTy res; + + res = parseStringWithPrefix("dst_unused", Value); + if 
(res != MatchOperand_Success) { + return res; + } + + int64_t Int; + Int = StringSwitch<int64_t>(Value) + .Case("UNUSED_PAD", 0) + .Case("UNUSED_SEXT", 1) + .Case("UNUSED_PRESERVE", 2) + .Default(0xffffffff); + Parser.Lex(); // eat last token + + if (Int == 0xffffffff) { + return MatchOperand_ParseFail; + } + + Operands.push_back(AMDGPUOperand::CreateImm(Int, S, + AMDGPUOperand::ImmTySdwaDstUnused)); + return MatchOperand_Success; +} + +void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP1); +} + +void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); +} + +void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); +} + +void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, + uint64_t BasicInstType) { + OptionalImmIndexMap OptionalIdx; + + unsigned I = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { + ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); + } + + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + // Add the register arguments + if (BasicInstType == SIInstrFlags::VOPC && + Op.isReg() && + Op.Reg.RegNo == AMDGPU::VCC) { + // VOPC sdwa use "vcc" token as dst. Skip it. + continue; + } else if (Op.isRegOrImmWithInputMods()) { + Op.addRegOrImmWithInputModsOperands(Inst, 2); + } else if (Op.isImm()) { + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = I; + } else { + llvm_unreachable("Invalid operand type"); + } + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + + if (Inst.getOpcode() == AMDGPU::V_NOP_sdwa) { + // V_NOP_sdwa has no optional sdwa arguments + return; + } + switch (BasicInstType) { + case SIInstrFlags::VOP1: { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + break; + } + case SIInstrFlags::VOP2: { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, 2); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + break; + } + case SIInstrFlags::VOPC: { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, 6); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, 6); + break; + } + default: + llvm_unreachable("Invalid instruction type. Only VOP1, VOP2 and VOPC allowed"); } } @@ -1890,3 +2755,37 @@ extern "C" void LLVMInitializeAMDGPUAsmParser() { #define GET_MATCHER_IMPLEMENTATION #include "AMDGPUGenAsmMatcher.inc" + +// This fuction should be defined after auto-generated include so that we have +// MatchClassKind enum defined +unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op, + unsigned Kind) { + // Tokens like "glc" would be parsed as immediate operands in ParseOperand(). + // But MatchInstructionImpl() expects to meet token and fails to validate + // operand. 
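The SDWA selector strings a little above map to small integers (BYTE_0..BYTE_3 = 0..3, WORD_0 = 4, WORD_1 = 5, DWORD = 6; UNUSED_PAD/SEXT/PRESERVE = 0/1/2), and cvtSDWA falls back to DWORD and UNUSED_PRESERVE when no sdwa modifiers were written. A tiny sketch capturing those values and defaults:

    enum SdwaSel   { BYTE_0 = 0, BYTE_1, BYTE_2, BYTE_3, WORD_0, WORD_1, DWORD };
    enum DstUnused { UNUSED_PAD = 0, UNUSED_SEXT, UNUSED_PRESERVE };

    struct SdwaDefaults {
      SdwaSel   DstSel  = DWORD;            // ImmTySdwaDstSel default 6
      DstUnused Unused  = UNUSED_PRESERVE;  // ImmTySdwaDstUnused default 2
      SdwaSel   Src0Sel = DWORD;            // ImmTySdwaSrc0Sel default 6
      SdwaSel   Src1Sel = DWORD;            // ImmTySdwaSrc1Sel default 6
    };

    int main() { return SdwaDefaults().DstSel == 6 ? 0 : 1; }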
This method checks if we are given immediate operand but expect to + // get corresponding token. + AMDGPUOperand &Operand = (AMDGPUOperand&)Op; + switch (Kind) { + case MCK_addr64: + return Operand.isAddr64() ? Match_Success : Match_InvalidOperand; + case MCK_gds: + return Operand.isGDS() ? Match_Success : Match_InvalidOperand; + case MCK_glc: + return Operand.isGLC() ? Match_Success : Match_InvalidOperand; + case MCK_idxen: + return Operand.isIdxen() ? Match_Success : Match_InvalidOperand; + case MCK_offen: + return Operand.isOffen() ? Match_Success : Match_InvalidOperand; + case MCK_SSrc32: + // When operands have expression values, they will return true for isToken, + // because it is not possible to distinguish between a token and an + // expression at parse time. MatchInstructionImpl() will always try to + // match an operand as a token, when isToken returns true, and when the + // name of the expression is not a valid token, the match will fail, + // so we need to handle it here. + return Operand.isSSrc32() ? Match_Success : Match_InvalidOperand; + case MCK_SoppBrTarget: + return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand; + default: return Match_InvalidOperand; + } +} diff --git a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt index 21ddc4eb83d2..70be7bb6eb36 100644 --- a/lib/Target/AMDGPU/AsmParser/CMakeLists.txt +++ b/lib/Target/AMDGPU/AsmParser/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUAsmParser AMDGPUAsmParser.cpp ) + +add_dependencies(LLVMAMDGPUAsmParser LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/AsmParser/Makefile b/lib/Target/AMDGPU/AsmParser/Makefile deleted file mode 100644 index 5ad219028036..000000000000 --- a/lib/Target/AMDGPU/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/AsmParser/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUAsmParser - -# Hack: we need to include 'main' AMDGPU target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/CIInstructions.td b/lib/Target/AMDGPU/CIInstructions.td index c543814cae0d..f9a9f79126bd 100644 --- a/lib/Target/AMDGPU/CIInstructions.td +++ b/lib/Target/AMDGPU/CIInstructions.td @@ -25,14 +25,6 @@ // BUFFER_LOAD_DWORDX3 // BUFFER_STORE_DWORDX3 - -def isCIVI : Predicate < - "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || " - "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS" ->, AssemblerPredicate<"FeatureCIInsts">; - -def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; - //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -108,9 +100,11 @@ defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, // MUBUF Instructions //===----------------------------------------------------------------------===// +let DisableSIDecoder = 1 in { defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol >; +} //===----------------------------------------------------------------------===// // Flat Instructions @@ -159,129 +153,114 @@ defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 >; defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < - flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32, i32, atomic_swap_flat >; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, i32, + atomic_cmp_swap_flat, v2i32, VReg_64 >; defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < - flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32, i32, atomic_add_flat >; defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < - flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32, i32, atomic_sub_flat >; defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < - flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32, i32, atomic_min_flat >; defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < - flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32, i32, atomic_umin_flat >; defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC < - flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32, i32, atomic_max_flat >; defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < - flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32, i32, atomic_umax_flat >; defm FLAT_ATOMIC_AND : FLAT_ATOMIC < - flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32, i32, atomic_and_flat >; defm FLAT_ATOMIC_OR : FLAT_ATOMIC < - flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32, i32, atomic_or_flat >; defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < - flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32, i32, atomic_xor_flat >; defm FLAT_ATOMIC_INC : FLAT_ATOMIC < - flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32, i32, atomic_inc_flat >; defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < - flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32, i32, atomic_dec_flat >; defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < - flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 + flat<0x50, 0x60>, "flat_atomic_swap_x2", 
VReg_64, i64, atomic_swap_flat >; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, i64, + atomic_cmp_swap_flat, v2i64, VReg_128 >; defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < - flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64, i64, atomic_add_flat >; defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < - flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64, i64, atomic_sub_flat >; defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < - flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64, i64, atomic_min_flat >; defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < - flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64, i64, atomic_umin_flat >; defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < - flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64, i64, atomic_max_flat >; defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < - flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64, i64, atomic_umax_flat >; defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < - flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64, i64, atomic_and_flat >; defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < - flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64, i64, atomic_or_flat >; defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < - flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64, i64, atomic_xor_flat >; defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < - flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64, i64, atomic_inc_flat >; defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < - flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64, i64, atomic_dec_flat >; } // End SubtargetPredicate = isCIVI // CI Only flat instructions -let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 in { defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, f32, + null_frag, v2f32, VReg_64 >; defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < - flat<0x3f>, "flat_atomic_fmin", VGPR_32 + flat<0x3f>, "flat_atomic_fmin", VGPR_32, f32 >; defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < - flat<0x40>, "flat_atomic_fmax", VGPR_32 + flat<0x40>, "flat_atomic_fmax", VGPR_32, f32 >; defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, f64, + null_frag, v2f64, VReg_128 >; defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < - flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64, f64 >; defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < - flat<0x60>, "flat_atomic_fmax_x2", VReg_64 + flat<0x60>, "flat_atomic_fmax_x2", VReg_64, f64 >; -} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst - -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert 
(x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - +} // End SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst, DisableVIDecoder = 1 //===----------------------------------------------------------------------===// // Flat Patterns @@ -289,12 +268,17 @@ def : Pat < let Predicates = [isCIVI] in { -// Patterns for global loads with no offset +// Patterns for global loads with no offset. class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < (vt (node i64:$addr)), (inst $addr, 0, 0, 0) >; +class FlatLoadAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 1, 0, 0) +>; + def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>; def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>; def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>; @@ -303,9 +287,20 @@ def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX2, flat_load, v2i32>; def : FlatLoadPat <FLAT_LOAD_DWORDX4, flat_load, v4i32>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORD, atomic_flat_load, i32>; +def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_flat_load, i64>; + + class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) + (inst $addr, $data, 0, 0, 0) +>; + +class FlatStoreAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + // atomic store follows atomic binop convention so the address comes + // first. + (node i64:$addr, vt:$data), + (inst $addr, $data, 1, 0, 0) >; def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i32>; @@ -314,20 +309,41 @@ def : FlatStorePat <FLAT_STORE_DWORD, flat_store, i32>; def : FlatStorePat <FLAT_STORE_DWORDX2, flat_store, v2i32>; def : FlatStorePat <FLAT_STORE_DWORDX4, flat_store, v4i32>; -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr, vt:$data)), +def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_flat_store, i32>; +def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_flat_store, i64>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt, + ValueType data_vt = vt> : Pat < + (vt (node i64:$addr, data_vt:$data)), (inst $addr, $data, 0, 0) >; def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>; def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>; +def : FlatAtomicPat 
<FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>; +def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>; + } // End Predicates = [isCIVI] diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index b9ef0e821763..45825c9cc76a 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -10,15 +10,30 @@ tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) add_public_tablegen_target(AMDGPUCommonTableGen) +# List of all GlobalISel files. +set(GLOBAL_ISEL_FILES + AMDGPUCallLowering.cpp + ) + +# Add GlobalISel files to the dependencies if the user wants to build it. +if(LLVM_BUILD_GLOBAL_ISEL) + set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) +else() + set(GLOBAL_ISEL_BUILD_FILES"") + set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +endif() + + add_llvm_target(AMDGPUCodeGen AMDILCFGStructurizer.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp AMDGPUAnnotateUniformValues.cpp AMDGPUAsmPrinter.cpp - AMDGPUDiagnosticInfoUnsupported.cpp + AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp AMDGPUTargetObjectFile.cpp AMDGPUIntrinsicInfo.cpp @@ -33,10 +48,12 @@ add_llvm_target(AMDGPUCodeGen AMDGPUInstrInfo.cpp AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp + GCNHazardRecognizer.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp R600EmitClauseMarkers.cpp R600ExpandSpecialInstrs.cpp + R600FrameLowering.cpp R600InstrInfo.cpp R600ISelLowering.cpp R600MachineFunctionInfo.cpp @@ -44,11 +61,10 @@ add_llvm_target(AMDGPUCodeGen R600OptimizeVectorRegisters.cpp R600Packetizer.cpp R600RegisterInfo.cpp - R600TextureIntrinsicsReplacer.cpp SIAnnotateControlFlow.cpp + SIDebuggerInsertNops.cpp SIFixControlFlowLiveIntervals.cpp SIFixSGPRCopies.cpp - SIFixSGPRLiveRanges.cpp SIFoldOperands.cpp SIFrameLowering.cpp SIInsertWaits.cpp @@ -62,10 +78,13 @@ add_llvm_target(AMDGPUCodeGen SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp + SIWholeQuadMode.cpp + ${GLOBAL_ISEL_BUILD_FILES} ) add_subdirectory(AsmParser) add_subdirectory(InstPrinter) +add_subdirectory(Disassembler) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) add_subdirectory(Utils) diff --git a/lib/Target/AMDGPU/CaymanInstructions.td b/lib/Target/AMDGPU/CaymanInstructions.td index a6c3785c815b..98bc6e856ea2 100644 --- a/lib/Target/AMDGPU/CaymanInstructions.td +++ b/lib/Target/AMDGPU/CaymanInstructions.td @@ -51,7 +51,6 @@ def : RsqPat<RECIPSQRT_IEEE_cm, f32>; def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>; defm DIV_cm : DIV_Common<RECIP_IEEE_cm>; -defm : 
Expand24UBitOps<MULLO_UINT_cm, ADD_INT>; // RECIP_UINT emulation for Cayman // The multiplication scales from [0,1] to the unsigned integer range @@ -203,27 +202,53 @@ def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0, //===----------------------------------------------------------------------===// // 8-bit reads -def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_8_cm : VTX_READ_8_cm <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +// 16-bit reads +def VTX_READ_ID1_16_cm : VTX_READ_16_cm <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] >; // 32-bit reads -def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_32_cm : VTX_READ_32_cm <1, + [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] >; // 64-bit reads -def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_64_cm : VTX_READ_64_cm <1, + [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] >; // 128-bit reads -def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_128_cm : VTX_READ_128_cm <1, + [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 8-bit reads +def VTX_READ_ID2_8_cm : VTX_READ_8_cm <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] +>; + +// 16-bit reads +def VTX_READ_ID2_16_cm : VTX_READ_16_cm <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_ID2_32_cm : VTX_READ_32_cm <2, + [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_ID2_64_cm : VTX_READ_64_cm <2, + [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_ID2_128_cm : VTX_READ_128_cm <2, + [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; } // End isCayman diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp new file mode 100644 index 000000000000..e11de855fe5f --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -0,0 +1,437 @@ +//===-- AMDGPUDisassembler.cpp - Disassembler for AMDGPU ISA --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file contains definition for AMDGPU ISA disassembler +// +//===----------------------------------------------------------------------===// + +// ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)? 
+ +#include "AMDGPUDisassembler.h" +#include "AMDGPU.h" +#include "AMDGPURegisterInfo.h" +#include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" + +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" + + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-disassembler" + +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; + + +inline static MCDisassembler::DecodeStatus +addOperand(MCInst &Inst, const MCOperand& Opnd) { + Inst.addOperand(Opnd); + return Opnd.isValid() ? + MCDisassembler::Success : + MCDisassembler::SoftFail; +} + +#define DECODE_OPERAND2(RegClass, DecName) \ +static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ + auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder); \ + return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ +} + +#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) + +DECODE_OPERAND(VGPR_32) +DECODE_OPERAND(VS_32) +DECODE_OPERAND(VS_64) + +DECODE_OPERAND(VReg_64) +DECODE_OPERAND(VReg_96) +DECODE_OPERAND(VReg_128) + +DECODE_OPERAND(SReg_32) +DECODE_OPERAND(SReg_32_XM0) +DECODE_OPERAND(SReg_64) +DECODE_OPERAND(SReg_128) +DECODE_OPERAND(SReg_256) +DECODE_OPERAND(SReg_512) + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + +#include "AMDGPUGenDisassemblerTables.inc" + +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) { + assert(Bytes.size() >= sizeof(T)); + const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data()); + Bytes = Bytes.slice(sizeof(T)); + return Res; +} + +DecodeStatus AMDGPUDisassembler::tryDecodeInst(const uint8_t* Table, + MCInst &MI, + uint64_t Inst, + uint64_t Address) const { + assert(MI.getOpcode() == 0); + assert(MI.getNumOperands() == 0); + MCInst TmpInst; + const auto SavedBytes = Bytes; + if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) { + MI = TmpInst; + return MCDisassembler::Success; + } + Bytes = SavedBytes; + return MCDisassembler::Fail; +} + +DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes_, + uint64_t Address, + raw_ostream &WS, + raw_ostream &CS) const { + CommentStream = &CS; + + // ToDo: AMDGPUDisassembler supports only VI ISA. 
+ assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA."); + + const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size()); + Bytes = Bytes_.slice(0, MaxInstBytesNum); + + DecodeStatus Res = MCDisassembler::Fail; + do { + // ToDo: better to switch encoding length using some bit predicate + // but it is unknown yet, so try all we can + + // Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2 + // encodings + if (Bytes.size() >= 8) { + const uint64_t QW = eatBytes<uint64_t>(Bytes); + Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); + if (Res) break; + } + + // Reinitialize Bytes as DPP64 could have eaten too much + Bytes = Bytes_.slice(0, MaxInstBytesNum); + + // Try decode 32-bit instruction + if (Bytes.size() < 4) break; + const uint32_t DW = eatBytes<uint32_t>(Bytes); + Res = tryDecodeInst(DecoderTableVI32, MI, DW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address); + if (Res) break; + + if (Bytes.size() < 4) break; + const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW; + Res = tryDecodeInst(DecoderTableVI64, MI, QW, Address); + if (Res) break; + + Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address); + } while (false); + + Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0; + return Res; +} + +const char* AMDGPUDisassembler::getRegClassName(unsigned RegClassID) const { + return getContext().getRegisterInfo()-> + getRegClassName(&AMDGPUMCRegisterClasses[RegClassID]); +} + +inline +MCOperand AMDGPUDisassembler::errOperand(unsigned V, + const Twine& ErrMsg) const { + *CommentStream << "Error: " + ErrMsg; + + // ToDo: add support for error operands to MCInst.h + // return MCOperand::createError(V); + return MCOperand(); +} + +inline +MCOperand AMDGPUDisassembler::createRegOperand(unsigned int RegId) const { + return MCOperand::createReg(RegId); +} + +inline +MCOperand AMDGPUDisassembler::createRegOperand(unsigned RegClassID, + unsigned Val) const { + const auto& RegCl = AMDGPUMCRegisterClasses[RegClassID]; + if (Val >= RegCl.getNumRegs()) + return errOperand(Val, Twine(getRegClassName(RegClassID)) + + ": unknown register " + Twine(Val)); + return createRegOperand(RegCl.getRegister(Val)); +} + +inline +MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID, + unsigned Val) const { + // ToDo: SI/CI have 104 SGPRs, VI - 102 + // Valery: here we accepting as much as we can, let assembler sort it out + int shift = 0; + switch (SRegClassID) { + case AMDGPU::SGPR_32RegClassID: + case AMDGPU::TTMP_32RegClassID: + break; + case AMDGPU::SGPR_64RegClassID: + case AMDGPU::TTMP_64RegClassID: + shift = 1; + break; + case AMDGPU::SGPR_128RegClassID: + case AMDGPU::TTMP_128RegClassID: + // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in + // this bundle? + case AMDGPU::SReg_256RegClassID: + // ToDo: unclear if s[96:104] is available on VI. Can we use VCC as SGPR in + // this bundle? + case AMDGPU::SReg_512RegClassID: + shift = 2; + break; + // ToDo: unclear if s[88:104] is available on VI. Can we use VCC as SGPR in + // this bundle? 
+ default: + assert(false); + break; + } + if (Val % (1 << shift)) + *CommentStream << "Warning: " << getRegClassName(SRegClassID) + << ": scalar reg isn't aligned " << Val; + return createRegOperand(SRegClassID, Val >> shift); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const { + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { + return decodeSrcOp(OPW64, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { + return createRegOperand(AMDGPU::VGPR_32RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_64RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_96RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const { + return createRegOperand(AMDGPU::VReg_128RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const { + // table-gen generated disassembler doesn't care about operand types + // leaving only registry class so SSrc_32 operand turns into SReg_32 + // and therefore we accept immediates and literals here as well + return decodeSrcOp(OPW32, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0(unsigned Val) const { + // SReg_32_XM0 is SReg_32 without M0 + return decodeOperand_SReg_32(Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const { + // see decodeOperand_SReg_32 comment + return decodeSrcOp(OPW64, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const { + return createSRegOperand(AMDGPU::SReg_256RegClassID, Val); +} + +MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const { + return createSRegOperand(AMDGPU::SReg_512RegClassID, Val); +} + + +MCOperand AMDGPUDisassembler::decodeLiteralConstant() const { + // For now all literal constants are supposed to be unsigned integer + // ToDo: deal with signed/unsigned 64-bit integer constants + // ToDo: deal with float/double constants + if (Bytes.size() < 4) + return errOperand(0, "cannot read literal, inst bytes left " + + Twine(Bytes.size())); + return MCOperand::createImm(eatBytes<uint32_t>(Bytes)); +} + +MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { + using namespace AMDGPU::EncValues; + assert(Imm >= INLINE_INTEGER_C_MIN && Imm <= INLINE_INTEGER_C_MAX); + return MCOperand::createImm((Imm <= INLINE_INTEGER_C_POSITIVE_MAX) ? + (static_cast<int64_t>(Imm) - INLINE_INTEGER_C_MIN) : + (INLINE_INTEGER_C_POSITIVE_MAX - static_cast<int64_t>(Imm))); + // Cast prevents negative overflow. +} + +MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) { + assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN + && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX); + // ToDo: case 248: 1/(2*PI) - is allowed only on VI + // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as + // literal constant. + float V = 0.0f; + switch (Imm) { + case 240: V = 0.5f; break; + case 241: V = -0.5f; break; + case 242: V = 1.0f; break; + case 243: V = -1.0f; break; + case 244: V = 2.0f; break; + case 245: V = -2.0f; break; + case 246: V = 4.0f; break; + case 247: V = -4.0f; break; + case 248: return MCOperand::createImm(Is32 ? 
// 1/(2*PI) + 0x3e22f983 : + 0x3fc45f306dc9c882); + default: break; + } + return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V)); +} + +unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return VGPR_32RegClassID; + case OPW64: return VReg_64RegClassID; + case OPW128: return VReg_128RegClassID; + } +} + +unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return SGPR_32RegClassID; + case OPW64: return SGPR_64RegClassID; + case OPW128: return SGPR_128RegClassID; + } +} + +unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { + using namespace AMDGPU; + assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); + switch (Width) { + default: // fall + case OPW32: return TTMP_32RegClassID; + case OPW64: return TTMP_64RegClassID; + case OPW128: return TTMP_128RegClassID; + } +} + +MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) const { + using namespace AMDGPU::EncValues; + assert(Val < 512); // enum9 + + if (VGPR_MIN <= Val && Val <= VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), Val - VGPR_MIN); + } + if (Val <= SGPR_MAX) { + assert(SGPR_MIN == 0); // "SGPR_MIN <= Val" is always true and causes compilation warning. + return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN); + } + if (TTMP_MIN <= Val && Val <= TTMP_MAX) { + return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); + } + + assert(Width == OPW32 || Width == OPW64); + const bool Is32 = (Width == OPW32); + + if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) + return decodeIntImmed(Val); + + if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX) + return decodeFPImmed(Is32, Val); + + if (Val == LITERAL_CONST) + return decodeLiteralConstant(); + + return Is32 ? 
decodeSpecialReg32(Val) : decodeSpecialReg64(Val); +} + +MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { + using namespace AMDGPU; + switch (Val) { + case 102: return createRegOperand(getMCReg(FLAT_SCR_LO, STI)); + case 103: return createRegOperand(getMCReg(FLAT_SCR_HI, STI)); + // ToDo: no support for xnack_mask_lo/_hi register + case 104: + case 105: break; + case 106: return createRegOperand(VCC_LO); + case 107: return createRegOperand(VCC_HI); + case 108: return createRegOperand(TBA_LO); + case 109: return createRegOperand(TBA_HI); + case 110: return createRegOperand(TMA_LO); + case 111: return createRegOperand(TMA_HI); + case 124: return createRegOperand(M0); + case 126: return createRegOperand(EXEC_LO); + case 127: return createRegOperand(EXEC_HI); + // ToDo: no support for vccz register + case 251: break; + // ToDo: no support for execz register + case 252: break; + case 253: return createRegOperand(SCC); + default: break; + } + return errOperand(Val, "unknown operand encoding " + Twine(Val)); +} + +MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { + using namespace AMDGPU; + switch (Val) { + case 102: return createRegOperand(getMCReg(FLAT_SCR, STI)); + case 106: return createRegOperand(VCC); + case 108: return createRegOperand(TBA); + case 110: return createRegOperand(TMA); + case 126: return createRegOperand(EXEC); + default: break; + } + return errOperand(Val, "unknown operand encoding " + Twine(Val)); +} + +static MCDisassembler *createAMDGPUDisassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AMDGPUDisassembler(STI, Ctx); +} + +extern "C" void LLVMInitializeAMDGPUDisassembler() { + TargetRegistry::RegisterMCDisassembler(TheGCNTarget, createAMDGPUDisassembler); +} diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h new file mode 100644 index 000000000000..dff26a044bf5 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -0,0 +1,93 @@ +//===-- AMDGPUDisassembler.hpp - Disassembler for AMDGPU ISA ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file contains declaration for AMDGPU ISA disassembler +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H +#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" + +namespace llvm { + + class MCContext; + class MCInst; + class MCOperand; + class MCSubtargetInfo; + class Twine; + + class AMDGPUDisassembler : public MCDisassembler { + private: + mutable ArrayRef<uint8_t> Bytes; + + public: + AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) : + MCDisassembler(STI, Ctx) {} + + ~AMDGPUDisassembler() {} + + DecodeStatus getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &WS, raw_ostream &CS) const override; + + const char* getRegClassName(unsigned RegClassID) const; + + MCOperand createRegOperand(unsigned int RegId) const; + MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const; + MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const; + + MCOperand errOperand(unsigned V, const llvm::Twine& ErrMsg) const; + + DecodeStatus tryDecodeInst(const uint8_t* Table, + MCInst &MI, + uint64_t Inst, + uint64_t Address) const; + + MCOperand decodeOperand_VGPR_32(unsigned Val) const; + MCOperand decodeOperand_VS_32(unsigned Val) const; + MCOperand decodeOperand_VS_64(unsigned Val) const; + + MCOperand decodeOperand_VReg_64(unsigned Val) const; + MCOperand decodeOperand_VReg_96(unsigned Val) const; + MCOperand decodeOperand_VReg_128(unsigned Val) const; + + MCOperand decodeOperand_SReg_32(unsigned Val) const; + MCOperand decodeOperand_SReg_32_XM0(unsigned Val) const; + MCOperand decodeOperand_SReg_64(unsigned Val) const; + MCOperand decodeOperand_SReg_128(unsigned Val) const; + MCOperand decodeOperand_SReg_256(unsigned Val) const; + MCOperand decodeOperand_SReg_512(unsigned Val) const; + + enum OpWidthTy { + OPW32, + OPW64, + OPW128, + OPW_LAST_, + OPW_FIRST_ = OPW32 + }; + unsigned getVgprClassId(const OpWidthTy Width) const; + unsigned getSgprClassId(const OpWidthTy Width) const; + unsigned getTtmpClassId(const OpWidthTy Width) const; + + static MCOperand decodeIntImmed(unsigned Imm); + static MCOperand decodeFPImmed(bool Is32, unsigned Imm); + MCOperand decodeLiteralConstant() const; + + MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSpecialReg32(unsigned Val) const; + MCOperand decodeSpecialReg64(unsigned Val) const; + }; +} // namespace llvm + +#endif //LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H diff --git a/lib/Target/AMDGPU/Disassembler/CMakeLists.txt b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt new file mode 100644 index 000000000000..fb9231576919 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. 
) + +add_llvm_library(LLVMAMDGPUDisassembler + AMDGPUDisassembler.cpp + ) + +add_dependencies(LLVMAMDGPUDisassembler AMDGPUCommonTableGen LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt new file mode 100644 index 000000000000..c9005f8a7884 --- /dev/null +++ b/lib/Target/AMDGPU/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AMDGPU/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AMDGPUDisassembler +parent = AMDGPU +required_libraries = AMDGPUDesc AMDGPUInfo AMDGPUUtils MC MCDisassembler Support +add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td index 2245f1417e53..94f05cc41aff 100644 --- a/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/lib/Target/AMDGPU/EvergreenInstructions.td @@ -85,8 +85,6 @@ def COS_eg : COS_Common<0x8E>; def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; -defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>; - //===----------------------------------------------------------------------===// // Memory read/write instructions //===----------------------------------------------------------------------===// @@ -212,23 +210,23 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> // VTX Read from parameter memory space //===----------------------------------------------------------------------===// -def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0, +def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <3, [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0, +def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <3, [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0, +def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <3, [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0, +def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <3, [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, +def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <3, [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))] >; @@ -237,27 +235,53 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0, //===----------------------------------------------------------------------===// // 8-bit reads -def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1, - [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID1_8_eg : VTX_READ_8_eg <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi8 ADDRVTX_READ:$src_gpr))] +>; + +// 16-bit reads +def VTX_READ_ID1_16_eg : VTX_READ_16_eg <1, + [(set i32:$dst_gpr, (vtx_id1_az_extloadi16 ADDRVTX_READ:$src_gpr))] +>; + +// 32-bit reads +def VTX_READ_ID1_32_eg : VTX_READ_32_eg <1, + [(set i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 64-bit reads +def VTX_READ_ID1_64_eg : VTX_READ_64_eg 
<1, + [(set v2i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 128-bit reads +def VTX_READ_ID1_128_eg : VTX_READ_128_eg <1, + [(set v4i32:$dst_gpr, (vtx_id1_load ADDRVTX_READ:$src_gpr))] +>; + +// 8-bit reads +def VTX_READ_ID2_8_eg : VTX_READ_8_eg <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi8 ADDRVTX_READ:$src_gpr))] >; -def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1, - [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))] +// 16-bit reads +def VTX_READ_ID2_16_eg : VTX_READ_16_eg <2, + [(set i32:$dst_gpr, (vtx_id2_az_extloadi16 ADDRVTX_READ:$src_gpr))] >; // 32-bit reads -def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1, - [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_32_eg : VTX_READ_32_eg <2, + [(set i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; // 64-bit reads -def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1, - [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_64_eg : VTX_READ_64_eg <2, + [(set v2i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; // 128-bit reads -def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1, - [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))] +def VTX_READ_ID2_128_eg : VTX_READ_128_eg <2, + [(set v4i32:$dst_gpr, (vtx_id2_load ADDRVTX_READ:$src_gpr))] >; } // End Predicates = [isEG] @@ -356,8 +380,6 @@ let hasSideEffects = 1 in { def MOVA_INT_eg : R600_1OP <0xCC, "MOVA_INT", [], VecALU>; } -def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; - def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { let Pattern = []; let Itinerary = AnyALU; @@ -372,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; def GROUP_BARRIER : InstR600 < - (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>, + (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>, R600ALU_Word0, R600ALU_Word1_OP2 <0x54> { @@ -401,11 +423,6 @@ def GROUP_BARRIER : InstR600 < let ALUInst = 1; } -def : Pat < - (int_AMDGPU_barrier_global), - (GROUP_BARRIER) ->; - //===----------------------------------------------------------------------===// // LDS Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp new file mode 100644 index 000000000000..29b1f79187d5 --- /dev/null +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -0,0 +1,264 @@ +//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements hazard recognizers for scheduling on GCN processors. 
+// +//===----------------------------------------------------------------------===// + +#include "GCNHazardRecognizer.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Hazard Recoginizer Implementation +//===----------------------------------------------------------------------===// + +GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) : + CurrCycleInstr(nullptr), + MF(MF), + ST(MF.getSubtarget<SISubtarget>()) { + MaxLookAhead = 5; +} + +void GCNHazardRecognizer::EmitInstruction(SUnit *SU) { + EmitInstruction(SU->getInstr()); +} + +void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) { + CurrCycleInstr = MI; +} + +ScheduleHazardRecognizer::HazardType +GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { + MachineInstr *MI = SU->getInstr(); + + if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) + return NoopHazard; + + if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0) + return NoopHazard; + + if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) + return NoopHazard; + + return NoHazard; +} + +unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) { + return PreEmitNoops(SU->getInstr()); +} + +unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) { + if (SIInstrInfo::isSMRD(*MI)) + return std::max(0, checkSMRDHazards(MI)); + + if (SIInstrInfo::isVMEM(*MI)) + return std::max(0, checkVMEMHazards(MI)); + + if (SIInstrInfo::isDPP(*MI)) + return std::max(0, checkDPPHazards(MI)); + + return 0; +} + +void GCNHazardRecognizer::EmitNoop() { + EmittedInstrs.push_front(nullptr); +} + +void GCNHazardRecognizer::AdvanceCycle() { + + // When the scheduler detects a stall, it will call AdvanceCycle() without + // emitting any instructions. + if (!CurrCycleInstr) + return; + + const SIInstrInfo *TII = ST.getInstrInfo(); + unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr); + + // Keep track of emitted instructions + EmittedInstrs.push_front(CurrCycleInstr); + + // Add a nullptr for each additional wait state after the first. Make sure + // not to add more than getMaxLookAhead() items to the list, since we + // truncate the list to that size right after this loop. + for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead()); + i < e; ++i) { + EmittedInstrs.push_front(nullptr); + } + + // getMaxLookahead() is the largest number of wait states we will ever need + // to insert, so there is no point in keeping track of more than that many + // wait states. 
+ EmittedInstrs.resize(getMaxLookAhead()); + + CurrCycleInstr = nullptr; +} + +void GCNHazardRecognizer::RecedeCycle() { + llvm_unreachable("hazard recognizer does not support bottom-up scheduling."); +} + +//===----------------------------------------------------------------------===// +// Helper Functions +//===----------------------------------------------------------------------===// + +int GCNHazardRecognizer::getWaitStatesSinceDef( + unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + int WaitStates = -1; + for (MachineInstr *MI : EmittedInstrs) { + ++WaitStates; + if (!MI || !IsHazardDef(MI)) + continue; + if (MI->modifiesRegister(Reg, TRI)) + return WaitStates; + } + return std::numeric_limits<int>::max(); +} + +//===----------------------------------------------------------------------===// +// No-op Hazard Detection +//===----------------------------------------------------------------------===// + +static void addRegsToSet(iterator_range<MachineInstr::const_mop_iterator> Ops, + std::set<unsigned> &Set) { + for (const MachineOperand &Op : Ops) { + if (Op.isReg()) + Set.insert(Op.getReg()); + } +} + +int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) { + // SMEM soft clause are only present on VI+ + if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return 0; + + // A soft-clause is any group of consecutive SMEM instructions. The + // instructions in this group may return out of order and/or may be + // replayed (i.e. the same instruction issued more than once). + // + // In order to handle these situations correctly we need to make sure + // that when a clause has more than one instruction, no instruction in the + // clause writes to a register that is read another instruction in the clause + // (including itself). If we encounter this situaion, we need to break the + // clause by inserting a non SMEM instruction. + + std::set<unsigned> ClauseDefs; + std::set<unsigned> ClauseUses; + + for (MachineInstr *MI : EmittedInstrs) { + + // When we hit a non-SMEM instruction then we have passed the start of the + // clause and we can stop. + if (!MI || !SIInstrInfo::isSMRD(*MI)) + break; + + addRegsToSet(MI->defs(), ClauseDefs); + addRegsToSet(MI->uses(), ClauseUses); + } + + if (ClauseDefs.empty()) + return 0; + + // FIXME: When we support stores, we need to make sure not to put loads and + // stores in the same clause if they use the same address. For now, just + // start a new clause whenever we see a store. + if (SMEM->mayStore()) + return 1; + + addRegsToSet(SMEM->defs(), ClauseDefs); + addRegsToSet(SMEM->uses(), ClauseUses); + + std::vector<unsigned> Result(std::max(ClauseDefs.size(), ClauseUses.size())); + std::vector<unsigned>::iterator End; + + End = std::set_intersection(ClauseDefs.begin(), ClauseDefs.end(), + ClauseUses.begin(), ClauseUses.end(), Result.begin()); + + // If the set of defs and uses intersect then we cannot add this instruction + // to the clause, so we have a hazard. + if (End != Result.begin()) + return 1; + + return 0; +} + +int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + int WaitStatesNeeded = 0; + + WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD); + + // This SMRD hazard only affects SI. 
+ if (ST.getGeneration() != SISubtarget::SOUTHERN_ISLANDS) + return WaitStatesNeeded; + + // A read of an SGPR by SMRD instruction requires 4 wait states when the + // SGPR was written by a VALU instruction. + int SmrdSgprWaitStates = 4; + auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + + for (const MachineOperand &Use : SMRD->uses()) { + if (!Use.isReg()) + continue; + int WaitStatesNeededForUse = + SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) { + const SIInstrInfo *TII = ST.getInstrInfo(); + + if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return 0; + + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + // A read of an SGPR by a VMEM instruction requires 5 wait states when the + // SGPR was written by a VALU Instruction. + int VmemSgprWaitStates = 5; + int WaitStatesNeeded = 0; + auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); }; + + for (const MachineOperand &Use : VMEM->uses()) { + if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + + int WaitStatesNeededForUse = + VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + return WaitStatesNeeded; +} + +int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) { + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + // Check for DPP VGPR read after VALU VGPR write. + int DppVgprWaitStates = 2; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Use : DPP->uses()) { + if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg())) + continue; + int WaitStatesNeededForUse = + DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg()); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse); + } + + return WaitStatesNeeded; +} diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h new file mode 100644 index 000000000000..d82041c5f174 --- /dev/null +++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -0,0 +1,62 @@ +//===-- GCNHazardRecognizers.h - GCN Hazard Recognizers ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines hazard recognizers for scheduling on GCN processors. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H +#define LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include <list> + +namespace llvm { + +class MachineFunction; +class MachineInstr; +class ScheduleDAG; +class SIInstrInfo; +class SISubtarget; + +class GCNHazardRecognizer final : public ScheduleHazardRecognizer { + // This variable stores the instruction that has been emitted this cycle. It + // will be added to EmittedInstrs, when AdvanceCycle() or RecedeCycle() is + // called. 
+ MachineInstr *CurrCycleInstr; + std::list<MachineInstr*> EmittedInstrs; + const MachineFunction &MF; + const SISubtarget &ST; + + int getWaitStatesSinceDef(unsigned Reg, + function_ref<bool(MachineInstr *)> IsHazardDef = + [](MachineInstr *) { return true; }); + + int checkSMEMSoftClauseHazards(MachineInstr *SMEM); + int checkSMRDHazards(MachineInstr *SMRD); + int checkVMEMHazards(MachineInstr* VMEM); + int checkDPPHazards(MachineInstr *DPP); +public: + GCNHazardRecognizer(const MachineFunction &MF); + // We can only issue one instruction per cycle. + bool atIssueLimit() const override { return true; } + void EmitInstruction(SUnit *SU) override; + void EmitInstruction(MachineInstr *MI) override; + HazardType getHazardType(SUnit *SU, int Stalls) override; + void EmitNoop() override; + unsigned PreEmitNoops(SUnit *SU) override; + unsigned PreEmitNoops(MachineInstr *) override; + void AdvanceCycle() override; + void RecedeCycle() override; +}; + +} // end namespace llvm + +#endif //LLVM_LIB_TARGET_AMDGPUHAZARDRECOGNIZERS_H diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index a187de88f639..2932d3bb1580 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -11,6 +11,7 @@ #include "AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" +#include "Utils/AMDGPUAsmUtils.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -18,6 +19,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <string> + using namespace llvm; void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, @@ -28,6 +31,11 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, printAnnotation(OS, Annot); } +void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatHex(MI->getOperand(OpNo).getImm() & 0xf); +} + void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << formatHex(MI->getOperand(OpNo).getImm() & 0xff); @@ -43,6 +51,11 @@ void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo, O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff); } +void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << formatDec(MI->getOperand(OpNo).getImm() & 0xf); +} + void AMDGPUInstPrinter::printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { O << formatDec(MI->getOperand(OpNo).getImm() & 0xff); @@ -53,22 +66,26 @@ void AMDGPUInstPrinter::printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, O << formatDec(MI->getOperand(OpNo).getImm() & 0xffff); } +void AMDGPUInstPrinter::printNamedBit(const MCInst* MI, unsigned OpNo, + raw_ostream& O, StringRef BitName) { + if (MI->getOperand(OpNo).getImm()) { + O << ' ' << BitName; + } +} + void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " offen"; + printNamedBit(MI, OpNo, O, "offen"); } void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " idxen"; + printNamedBit(MI, OpNo, O, "idxen"); } void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " addr64"; + printNamedBit(MI, OpNo, O, "addr64"); } void 
AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, @@ -79,7 +96,7 @@ void AMDGPUInstPrinter::printMBUFOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O) { uint16_t Imm = MI->getOperand(OpNo).getImm(); if (Imm != 0) { @@ -88,7 +105,7 @@ void AMDGPUInstPrinter::printDSOffset(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset0:"; @@ -96,7 +113,7 @@ void AMDGPUInstPrinter::printDSOffset0(const MCInst *MI, unsigned OpNo, } } -void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, +void AMDGPUInstPrinter::printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MI->getOperand(OpNo).getImm()) { O << " offset1:"; @@ -104,28 +121,62 @@ void AMDGPUInstPrinter::printDSOffset1(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printSMRDOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printU32ImmOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " gds"; + printNamedBit(MI, OpNo, O, "gds"); } void AMDGPUInstPrinter::printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " glc"; + printNamedBit(MI, OpNo, O, "glc"); } void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " slc"; + printNamedBit(MI, OpNo, O, "slc"); } void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - if (MI->getOperand(OpNo).getImm()) - O << " tfe"; + printNamedBit(MI, OpNo, O, "tfe"); +} + +void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dmask:"; + printU16ImmOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "unorm"); +} + +void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "da"); +} + +void AMDGPUInstPrinter::printR128(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "r128"); +} + +void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printNamedBit(MI, OpNo, O, "lwe"); } void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, @@ -152,6 +203,18 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, case AMDGPU::VCC_HI: O << "vcc_hi"; return; + case AMDGPU::TBA_LO: + O << "tba_lo"; + return; + case AMDGPU::TBA_HI: + O << "tba_hi"; + return; + case AMDGPU::TMA_LO: + O << "tma_lo"; + return; + case AMDGPU::TMA_HI: + O << "tma_hi"; + return; case AMDGPU::EXEC_LO: O << "exec_lo"; return; @@ -168,62 +231,73 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O, break; } - char Type; - unsigned NumRegs; + // The low 8 bits of the encoding value is the register index, for both VGPRs + // and SGPRs. 
+ unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); + unsigned NumRegs; if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 1; } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) { - Type = 'v'; + O <<'v'; NumRegs = 2; - } else if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) { - Type = 's'; + } else if (MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(reg)) { + O << 's'; NumRegs = 2; } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 4; - } else if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) { - Type = 's'; + } else if (MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(reg)) { + O << 's'; NumRegs = 4; } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 3; } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 8; } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) { - Type = 'v'; + O << 'v'; NumRegs = 16; } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) { - Type = 's'; + O << 's'; NumRegs = 16; + } else if (MRI.getRegClass(AMDGPU::TTMP_64RegClassID).contains(reg)) { + O << "ttmp"; + NumRegs = 2; + RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. + } else if (MRI.getRegClass(AMDGPU::TTMP_128RegClassID).contains(reg)) { + O << "ttmp"; + NumRegs = 4; + RegIdx -= 112; // Trap temps start at offset 112. TODO: Get this from tablegen. } else { O << getRegisterName(reg); return; } - // The low 8 bits of the encoding value is the register index, for both VGPRs - // and SGPRs. 
- unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8) - 1); if (NumRegs == 1) { - O << Type << RegIdx; + O << RegIdx; return; } - O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; + O << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']'; } void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O) { if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::VOP3) O << "_e64 "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::DPP) + O << "_dpp "; + else if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SDWA) + O << "_sdwa "; else O << "_e32 "; @@ -345,12 +419,13 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCExpr *Exp = Op.getExpr(); Exp->print(O, &MAI); } else { - llvm_unreachable("unknown operand type in printOperand"); + O << "/*INV_OP*/"; } } -void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { +void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { unsigned InputModifiers = MI->getOperand(OpNo).getImm(); if (InputModifiers & SISrcMods::NEG) O << '-'; @@ -361,6 +436,122 @@ void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo, O << '|'; } +void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI, + unsigned OpNo, + raw_ostream &O) { + unsigned InputModifiers = MI->getOperand(OpNo).getImm(); + if (InputModifiers & SISrcMods::SEXT) + O << "sext("; + printOperand(MI, OpNo + 1, O); + if (InputModifiers & SISrcMods::SEXT) + O << ')'; +} + + +void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm <= 0x0ff) { + O << " quad_perm:["; + O << formatDec(Imm & 0x3) << ','; + O << formatDec((Imm & 0xc) >> 2) << ','; + O << formatDec((Imm & 0x30) >> 4) << ','; + O << formatDec((Imm & 0xc0) >> 6) << ']'; + } else if ((Imm >= 0x101) && (Imm <= 0x10f)) { + O << " row_shl:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= 0x111) && (Imm <= 0x11f)) { + O << " row_shr:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if ((Imm >= 0x121) && (Imm <= 0x12f)) { + O << " row_ror:"; + printU4ImmDecOperand(MI, OpNo, O); + } else if (Imm == 0x130) { + O << " wave_shl:1"; + } else if (Imm == 0x134) { + O << " wave_rol:1"; + } else if (Imm == 0x138) { + O << " wave_shr:1"; + } else if (Imm == 0x13c) { + O << " wave_ror:1"; + } else if (Imm == 0x140) { + O << " row_mirror"; + } else if (Imm == 0x141) { + O << " row_half_mirror"; + } else if (Imm == 0x142) { + O << " row_bcast:15"; + } else if (Imm == 0x143) { + O << " row_bcast:31"; + } else { + llvm_unreachable("Invalid dpp_ctrl value"); + } +} + +void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " row_mask:"; + printU4ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << " bank_mask:"; + printU4ImmOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printBoundCtrl(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + if (Imm) { + O << " bound_ctrl:0"; // XXX - this syntax is used in sp3 + } +} + +void AMDGPUInstPrinter::printSDWASel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Imm = MI->getOperand(OpNo).getImm(); + switch (Imm) { + case 0: O << "BYTE_0"; break; + case 1: O << "BYTE_1"; break; + case 2: O << "BYTE_2"; break; + case 3: O << "BYTE_3"; break; + case 
4: O << "WORD_0"; break; + case 5: O << "WORD_1"; break; + case 6: O << "DWORD"; break; + default: llvm_unreachable("Invalid SDWA data select operand"); + } +} + +void AMDGPUInstPrinter::printSDWADstSel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "dst_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "src0_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "src1_sel:"; + printSDWASel(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printSDWADstUnused(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + O << "dst_unused:"; + unsigned Imm = MI->getOperand(OpNo).getImm(); + switch (Imm) { + case 0: O << "UNUSED_PAD"; break; + case 1: O << "UNUSED_SEXT"; break; + case 2: O << "UNUSED_PRESERVE"; break; + default: llvm_unreachable("Invalid SDWA dest_unused operand"); + } +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); @@ -395,9 +586,17 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, } } +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, char Asm) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) + O << Asm; +} + void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "|"); + printIfSet(MI, OpNo, O, '|'); } void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, @@ -424,8 +623,15 @@ void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - int32_t Imm = MI->getOperand(OpNo).getImm(); - O << Imm << '(' << BitsToFloat(Imm) << ')'; + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() || Op.isExpr()); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + O << Imm << '(' << BitsToFloat(Imm) << ')'; + } + if (Op.isExpr()) { + Op.getExpr()->print(O << '@', &MAI); + } } void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, @@ -435,7 +641,7 @@ void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "-"); + printIfSet(MI, OpNo, O, '-'); } void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, @@ -456,7 +662,7 @@ void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printIfSet(MI, OpNo, O, "+"); + printIfSet(MI, OpNo, O, '+'); } void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, @@ -585,43 +791,49 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - unsigned SImm16 = MI->getOperand(OpNo).getImm(); - unsigned Msg = SImm16 & 0xF; - if (Msg == 2 || Msg == 3) { - unsigned Op = (SImm16 >> 4) & 0xF; - if (Msg == 3) - O << "Gs_done("; - else - O << "Gs("; - if (Op == 0) { - O << "nop"; - } else { - unsigned Stream = (SImm16 >> 8) & 0x3; - if (Op == 1) - O << "cut"; - else if (Op == 2) - O << "emit"; - else if (Op == 3) - O << "emit-cut"; - O << " stream " << Stream; + using namespace llvm::AMDGPU::SendMsg; + + const unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const 
unsigned Id = SImm16 & ID_MASK_; + do { + if (Id == ID_INTERRUPT) { + if ((SImm16 & ~ID_MASK_) != 0) // Unused/unknown bits must be 0. + break; + O << "sendmsg(" << IdSymbolic[Id] << ')'; + return; } - O << "), [m0] "; - } else if (Msg == 1) - O << "interrupt "; - else if (Msg == 15) - O << "system "; - else - O << "unknown(" << Msg << ") "; + if (Id == ID_GS || Id == ID_GS_DONE) { + if ((SImm16 & ~(ID_MASK_|OP_GS_MASK_|STREAM_ID_MASK_)) != 0) // Unused/unknown bits must be 0. + break; + const unsigned OpGs = (SImm16 & OP_GS_MASK_) >> OP_SHIFT_; + const unsigned StreamId = (SImm16 & STREAM_ID_MASK_) >> STREAM_ID_SHIFT_; + if (OpGs == OP_GS_NOP && Id != ID_GS_DONE) // NOP to be used for GS_DONE only. + break; + if (OpGs == OP_GS_NOP && StreamId != 0) // NOP does not use/define stream id bits. + break; + O << "sendmsg(" << IdSymbolic[Id] << ", " << OpGsSymbolic[OpGs]; + if (OpGs != OP_GS_NOP) { O << ", " << StreamId; } + O << ')'; + return; + } + if (Id == ID_SYSMSG) { + if ((SImm16 & ~(ID_MASK_|OP_SYS_MASK_)) != 0) // Unused/unknown bits must be 0. + break; + const unsigned OpSys = (SImm16 & OP_SYS_MASK_) >> OP_SHIFT_; + if (! (OP_SYS_FIRST_ <= OpSys && OpSys < OP_SYS_LAST_)) // Unused/unknown. + break; + O << "sendmsg(" << IdSymbolic[Id] << ", " << OpSysSymbolic[OpSys] << ')'; + return; + } + } while (0); + O << SImm16; // Unknown simm16 code. } void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs - // SIInsertWaits.cpp bits usage does not match ISA docs description but it - // works so it might be a misprint in docs. unsigned SImm16 = MI->getOperand(OpNo).getImm(); unsigned Vmcnt = SImm16 & 0xF; - unsigned Expcnt = (SImm16 >> 4) & 0xF; + unsigned Expcnt = (SImm16 >> 4) & 0x7; unsigned Lgkmcnt = (SImm16 >> 8) & 0xF; bool NeedSpace = false; @@ -638,11 +850,32 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo, NeedSpace = true; } - if (Lgkmcnt != 0x7) { + if (Lgkmcnt != 0xF) { if (NeedSpace) O << ' '; O << "lgkmcnt(" << Lgkmcnt << ')'; } } +void AMDGPUInstPrinter::printHwreg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + using namespace llvm::AMDGPU::Hwreg; + + unsigned SImm16 = MI->getOperand(OpNo).getImm(); + const unsigned Id = (SImm16 & ID_MASK_) >> ID_SHIFT_; + const unsigned Offset = (SImm16 & OFFSET_MASK_) >> OFFSET_SHIFT_; + const unsigned Width = ((SImm16 & WIDTH_M1_MASK_) >> WIDTH_M1_SHIFT_) + 1; + + O << "hwreg("; + if (ID_SYMBOLIC_FIRST_ <= Id && Id < ID_SYMBOLIC_LAST_) { + O << IdSymbolic[Id]; + } else { + O << Id; + } + if (Width != WIDTH_M1_DEFAULT_ + 1 || Offset != OFFSET_DEFAULT_) { + O << ", " << Offset << ", " << Width; + } + O << ')'; +} + #include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 90541d86132d..f5a290f16045 100644 --- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -10,8 +10,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H +#define LLVM_LIB_TARGET_AMDGPU_INSTPRINTER_AMDGPUINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" @@ -33,37 +33,60 @@ public: const MCRegisterInfo &MRI); private: + void printU4ImmOperand(const MCInst *MI, 
unsigned OpNo, raw_ostream &O); void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU8ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU16ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printNamedBit(const MCInst* MI, unsigned OpNo, raw_ostream& O, + StringRef BitName); void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printMBUFOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printDSOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset0(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOffset1(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSMRDOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGDS(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printGLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printUNorm(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDA(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printR128(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLWE(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printImmediate32(uint32_t I, raw_ostream &O); void printImmediate64(uint64_t I, raw_ostream &O); void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printDPPCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printRowMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBankMask(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printBoundCtrl(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWADstSel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASrc0Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWASrc1Sel(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printSDWADstUnused(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm, StringRef Default = ""); + static void printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, char Asm); 
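As an illustration of the dpp_ctrl decoding that printDPPCtrl implements above, the following standalone sketch maps the same immediate ranges to their assembly spellings. It is plain C++ with no LLVM types; the helper name and the snprintf-based formatting are illustrative only, while the value ranges are copied from the function in this diff.

#include <cstdio>
#include <string>

// Decode a dpp_ctrl immediate into the textual form used by the printer above.
static std::string decodeDppCtrl(unsigned Imm) {
  char Buf[64];
  if (Imm <= 0xff) {
    // quad_perm: two bits select the source lane for each of the four lanes.
    std::snprintf(Buf, sizeof(Buf), "quad_perm:[%u,%u,%u,%u]", Imm & 0x3,
                  (Imm >> 2) & 0x3, (Imm >> 4) & 0x3, (Imm >> 6) & 0x3);
    return Buf;
  }
  if (Imm >= 0x101 && Imm <= 0x10f) {
    std::snprintf(Buf, sizeof(Buf), "row_shl:%u", Imm & 0xf);
    return Buf;
  }
  if (Imm >= 0x111 && Imm <= 0x11f) {
    std::snprintf(Buf, sizeof(Buf), "row_shr:%u", Imm & 0xf);
    return Buf;
  }
  if (Imm >= 0x121 && Imm <= 0x12f) {
    std::snprintf(Buf, sizeof(Buf), "row_ror:%u", Imm & 0xf);
    return Buf;
  }
  switch (Imm) {
  case 0x130: return "wave_shl:1";
  case 0x134: return "wave_rol:1";
  case 0x138: return "wave_shr:1";
  case 0x13c: return "wave_ror:1";
  case 0x140: return "row_mirror";
  case 0x141: return "row_half_mirror";
  case 0x142: return "row_bcast:15";
  case 0x143: return "row_bcast:31";
  default:    return "/*invalid dpp_ctrl*/"; // printDPPCtrl treats this as unreachable
  }
}

int main() {
  std::printf("%s\n", decodeDppCtrl(0x101).c_str()); // row_shl:1
  std::printf("%s\n", decodeDppCtrl(0x141).c_str()); // row_half_mirror
}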
static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printClampSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printOModSI(const MCInst *MI, unsigned OpNo, raw_ostream &O); - static void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O); @@ -79,6 +102,7 @@ private: static void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printSendMsg(const MCInst *MI, unsigned OpNo, raw_ostream &O); static void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O); + static void printHwreg(const MCInst *MI, unsigned OpNo, raw_ostream &O); }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt index ce63bd553b9c..7191ff2c4577 100644 --- a/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt +++ b/lib/Target/AMDGPU/InstPrinter/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUAsmPrinter AMDGPUInstPrinter.cpp ) + +add_dependencies(LLVMAMDGPUAsmPrinter LLVMAMDGPUUtils) diff --git a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt index fdb43844dc63..30c2670316c8 100644 --- a/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt +++ b/lib/Target/AMDGPU/InstPrinter/LLVMBuild.txt @@ -19,6 +19,6 @@ type = Library name = AMDGPUAsmPrinter parent = AMDGPU -required_libraries = MC Support +required_libraries = MC Support AMDGPUUtils add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/InstPrinter/Makefile b/lib/Target/AMDGPU/InstPrinter/Makefile deleted file mode 100644 index 4e48ac7e28a9..000000000000 --- a/lib/Target/AMDGPU/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -#===- lib/Target/R600/AsmPrinter/Makefile ------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUAsmPrinter - -# Hack: we need to include 'main' x86 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/LLVMBuild.txt b/lib/Target/AMDGPU/LLVMBuild.txt index 38c5489586f1..bbdd17737cf0 100644 --- a/lib/Target/AMDGPU/LLVMBuild.txt +++ b/lib/Target/AMDGPU/LLVMBuild.txt @@ -1,4 +1,4 @@ -;===- ./lib/Target/AMDIL/LLVMBuild.txt -------------------------*- Conf -*--===; +;===- ./lib/Target/AMDGPU/LLVMBuild.txt ------------------------*- Conf -*--===; ; ; The LLVM Compiler Infrastructure ; @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo Utils +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils [component_0] type = TargetGroup @@ -24,10 +24,11 @@ name = AMDGPU parent = Target has_asmparser = 1 has_asmprinter = 1 +has_disassembler = 1 [component_1] type = Library name = AMDGPUCodeGen parent = AMDGPU -required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmParser AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils +required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize add_to_library_groups = AMDGPU diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 60e8c8f3d303..1cb9d21408c6 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -53,7 +53,8 @@ public: const MCAsmLayout &Layout) const override { return false; } - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override { + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override { assert(!"Not implemented"); } bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -73,12 +74,17 @@ void AMDGPUMCObjectWriter::writeObject(MCAssembler &Asm, static unsigned getFixupKindNumBytes(unsigned Kind) { switch (Kind) { + case FK_SecRel_1: case FK_Data_1: return 1; + case FK_SecRel_2: case FK_Data_2: return 2; + case FK_SecRel_4: case FK_Data_4: + case FK_PCRel_4: return 4; + case FK_SecRel_8: case FK_Data_8: return 8; default: @@ -92,32 +98,15 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, switch ((unsigned)Fixup.getKind()) { case AMDGPU::fixup_si_sopp_br: { + int64_t BrImm = ((int64_t)Value - 4) / 4; + if (!isInt<16>(BrImm)) + report_fatal_error("branch size exceeds simm16"); + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); - *Dst = (Value - 4) / 4; + *Dst = BrImm; break; } - case AMDGPU::fixup_si_rodata: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // We emit constant data at the end of the text section and generate its - // address using the following code sequence: - // s_getpc_b64 s[0:1] - // s_add_u32 s0, s0, $symbol - // s_addc_u32 s1, s1, 0 - // - // s_getpc_b64 returns the address of the s_add_u32 instruction and then - // the fixup replaces $symbol with a literal constant, which is a - // pc-relative offset from the encoding of the $symbol operand to the - // constant data. - // - // What we want here is an offset from the start of the s_add_u32 - // instruction to the constant data, but since the encoding of $symbol - // starts 4 bytes after the start of the add instruction, we end up - // with an offset that is 4 bytes too small. 
This requires us to - // add 4 to the fixup value before applying it. - *Dst = Value + 4; - break; - } default: { // FIXME: Copied from AArch64 unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); @@ -144,7 +133,6 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) @@ -167,13 +155,15 @@ namespace { class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; + bool HasRelocationAddend; public: - ELFAMDGPUAsmBackend(const Target &T, bool Is64Bit) : - AMDGPUAsmBackend(T), Is64Bit(Is64Bit) { } + ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) : + AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), + HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { } MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { - return createAMDGPUELFObjectWriter(Is64Bit, OS); + return createAMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend, OS); } }; @@ -182,8 +172,6 @@ public: MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU) { - Triple TargetTriple(TT); - // Use 64-bit ELF for amdgcn - return new ELFAMDGPUAsmBackend(T, TargetTriple.getArch() == Triple::amdgcn); + return new ELFAMDGPUAsmBackend(T, TT); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 820f17df8960..b4e3b8e896bd 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -18,23 +18,56 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - AMDGPUELFObjectWriter(bool Is64Bit); + AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend); protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override { - return Fixup.getKind(); - } - + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; }; } // End anonymous namespace -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit) - : MCELFObjectTargetWriter(Is64Bit, ELF::ELFOSABI_AMDGPU_HSA, - ELF::EM_AMDGPU, false) { } +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(Is64Bit, + ELF::ELFOSABI_AMDGPU_HSA, + ELF::EM_AMDGPU, + HasRelocationAddend) { } + +unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + // SCRATCH_RSRC_DWORD[01] is a special global variable that represents + // the scratch buffer. 
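A similarly small sketch of the simm16 computation that the new fixup_si_sopp_br handling in AMDGPUAsmBackend::applyFixup performs above: the resolved byte offset is reduced by 4 and divided by 4 before the 16-bit range check. Reading the subtraction as "the hardware immediate counts dwords starting at the instruction after the 4-byte branch" is an assumption here, not something stated in the diff; the wrapper name and the exception are likewise illustrative (the diff calls report_fatal_error).

#include <cstdint>
#include <stdexcept>

// ByteOffset is the resolved, PC-relative fixup value in bytes.
static int16_t encodeSoppBranchImm(int64_t ByteOffset) {
  int64_t BrImm = (ByteOffset - 4) / 4;
  // Mirrors the isInt<16>(BrImm) check added in the diff.
  if (BrImm < INT16_MIN || BrImm > INT16_MAX)
    throw std::out_of_range("branch size exceeds simm16");
  return static_cast<int16_t>(BrImm);
}

For example, encodeSoppBranchImm(8) yields 1.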
+ if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD0") + return ELF::R_AMDGPU_ABS32_LO; + if (Target.getSymA()->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") + return ELF::R_AMDGPU_ABS32_HI; + + switch (Target.getAccessVariant()) { + default: + break; + case MCSymbolRefExpr::VK_GOTPCREL: + return ELF::R_AMDGPU_GOTPCREL; + } + + switch (Fixup.getKind()) { + default: break; + case FK_PCRel_4: + return ELF::R_AMDGPU_REL32; + case FK_SecRel_4: + return ELF::R_AMDGPU_ABS32; + } + + llvm_unreachable("unhandled relocation type"); +} + -MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, raw_pwrite_stream &OS) { - MCELFObjectTargetWriter *MOTW = new AMDGPUELFObjectWriter(Is64Bit); +MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend, + raw_pwrite_stream &OS) { + MCELFObjectTargetWriter *MOTW = + new AMDGPUELFObjectWriter(Is64Bit, HasRelocationAddend); return createELFObjectWriter(MOTW, OS, true); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp index 9ff9fe794d2b..43338a5bebd2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -12,11 +12,6 @@ using namespace llvm; -void AMDGPUELFStreamer::InitSections(bool NoExecStack) { - // Start with the .hsatext section by default. - SwitchSection(AMDGPU::getHSATextSection(getContext())); -} - MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h index 488d7e74d741..5319b65d65f9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -1,4 +1,4 @@ -//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +//===-------- AMDGPUELFStreamer.h - ELF Object Output -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -29,7 +29,6 @@ public: MCCodeEmitter *Emitter) : MCELFStreamer(Context, MAB, OS, Emitter) { } - virtual void InitSections(bool NoExecStac) override; }; MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 59a9178082f6..20c1adfbc6b9 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -7,8 +7,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUFIXUPKINDS_H #include "llvm/MC/MCFixup.h" @@ -18,9 +18,6 @@ enum Fixups { /// 16-bit PC relative fixup for SOPP branch instructions. 
fixup_si_sopp_br = FirstTargetFixupKind, - /// fixup for global addresses with constant initializers - fixup_si_rodata, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 4bc80a028936..1655591abf39 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -9,12 +9,15 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" +#include "llvm/ADT/Triple.h" using namespace llvm; + AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { HasSingleParameterDotFile = false; //===------------------------------------------------------------------===// - MaxInstLength = 16; + MinInstAlignment = 4; + MaxInstLength = (TT.getArch() == Triple::amdgcn) ? 8 : 16; SeparatorString = "\n"; CommentString = ";"; PrivateLabelPrefix = ""; diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a546961705d7..8cb33a3179cd 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCASMINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCASMINFO_H #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index c95742762233..c942ea904085 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H #include "llvm/MC/MCCodeEmitter.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index f70409470276..a0d9aab114fc 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -18,7 +18,6 @@ #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" #include "SIDefines.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -56,15 +55,6 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); } -static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; -} - static MCInstPrinter *createAMDGPUMCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -99,7 +89,6 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, 
&TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); - TargetRegistry::RegisterMCCodeGenInfo(*T, createAMDGPUMCCodeGenInfo); TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 5d1b86b8c0c2..9ab7940812ba 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -13,13 +13,13 @@ //===----------------------------------------------------------------------===// // -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include "llvm/ADT/StringRef.h" namespace llvm { +class StringRef; class MCAsmBackend; class MCCodeEmitter; class MCContext; @@ -47,6 +47,7 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU); MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit, + bool HasRelocationAddend, raw_pwrite_stream &OS); } // End llvm namespace diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index b91134d2ee9b..83dcaacb738f 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -312,10 +312,6 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - // The MCObjectFileInfo that is available to the assembler is a generic - // implementation and not AMDGPUHSATargetObjectFile, so we can't use - // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. - OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 83bb728f541c..b3d59e8f396e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,16 +7,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H -#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" + namespace llvm { class MCELFStreamer; +class MCSymbol; class AMDGPUTargetStreamer : public MCTargetStreamer { public: diff --git a/lib/Target/AMDGPU/MCTargetDesc/Makefile b/lib/Target/AMDGPU/MCTargetDesc/Makefile deleted file mode 100644 index 5ad68662d98c..000000000000 --- a/lib/Target/AMDGPU/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUDesc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 3c1142dd664b..5e8e6ceb7ca2 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "R600Defines.h" +#include "MCTargetDesc/AMDGPUFixupKinds.h" #include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCCodeEmitter.h" @@ -51,12 +52,9 @@ public: const MCSubtargetInfo &STI) const override; private: - void EmitByte(unsigned int byte, raw_ostream &OS) const; - void Emit(uint32_t value, raw_ostream &OS) const; void Emit(uint64_t value, raw_ostream &OS) const; - unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; }; @@ -142,10 +140,6 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, } } -void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { - OS.write((uint8_t) Byte & 0xff); -} - void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { support::endian::Writer<support::little>(OS).write(Value); } @@ -154,17 +148,13 @@ void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { support::endian::Writer<support::little>(OS).write(Value); } -unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { - return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; -} - unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { return MRI.getEncodingValue(RegNo) & HW_REG_MASK; } uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl<MCFixup> &Fixup, + SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { if (MO.isReg()) { if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) @@ -172,6 +162,18 @@ uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, return getHWReg(MO.getReg()); } + if (MO.isExpr()) { + // We put rodata at the end of code section, then map the entire + // code secetion as vtx buf. Thus the section relative address is the + // correct one. + // Each R600 literal instruction has two operands + // We can't easily get the order of the current one, so compare against + // the first one and adjust offset. + const unsigned offset = (&MO == &MI.getOperand(0)) ? 
0 : 4; + Fixups.push_back(MCFixup::create(offset, MO.getExpr(), FK_SecRel_4, MI.getLoc())); + return 0; + } + assert(MO.isImm()); return MO.getImm(); } diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 9eb3dadbc5e2..71b585c25ac5 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -162,20 +162,30 @@ static uint32_t getLit64Encoding(uint64_t Val) { uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, unsigned OpSize) const { - if (MO.isExpr()) - return 255; - assert(!MO.isFPImm()); + int64_t Imm; + if (MO.isExpr()) { + const MCConstantExpr *C = dyn_cast<MCConstantExpr>(MO.getExpr()); + if (!C) + return 255; + + Imm = C->getValue(); + } else { - if (!MO.isImm()) - return ~0; + assert(!MO.isFPImm()); + + if (!MO.isImm()) + return ~0; + + Imm = MO.getImm(); + } if (OpSize == 4) - return getLit32Encoding(static_cast<uint32_t>(MO.getImm())); + return getLit32Encoding(static_cast<uint32_t>(Imm)); assert(OpSize == 8); - return getLit64Encoding(static_cast<uint64_t>(MO.getImm())); + return getLit64Encoding(static_cast<uint64_t>(Imm)); } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -213,7 +223,11 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (Op.isImm()) Imm = Op.getImm(); - else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. + else if (Op.isExpr()) { + if (const MCConstantExpr *C = dyn_cast<MCConstantExpr>(Op.getExpr())) + Imm = C->getValue(); + + } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value. llvm_unreachable("Must be immediate or expr"); for (unsigned j = 0; j < 4; j++) { @@ -247,10 +261,14 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isReg()) return MRI.getEncodingValue(MO.getReg()); - if (MO.isExpr()) { - const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); + if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) { + const MCSymbolRefExpr *Expr = dyn_cast<MCSymbolRefExpr>(MO.getExpr()); + MCFixupKind Kind; + if (Expr && Expr->getSymbol().isExternal()) + Kind = FK_Data_4; + else + Kind = FK_PCRel_4; + Fixups.push_back(MCFixup::create(4, MO.getExpr(), Kind, MI.getLoc())); } // Figure out the operand number, needed for isSrcOperand check diff --git a/lib/Target/AMDGPU/Makefile b/lib/Target/AMDGPU/Makefile deleted file mode 100644 index 219f34daa24f..000000000000 --- a/lib/Target/AMDGPU/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMAMDGPUCodeGen -TARGET = AMDGPU - -# Make sure that tblgen is run, first thing. 
-BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ - AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ - AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ - AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ - AMDGPUGenAsmWriter.inc AMDGPUGenAsmMatcher.inc - -DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td index 4300d972d46b..f5f1eb14e993 100644 --- a/lib/Target/AMDGPU/Processors.td +++ b/lib/Target/AMDGPU/Processors.td @@ -13,11 +13,8 @@ class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Featur //===----------------------------------------------------------------------===// // R600 //===----------------------------------------------------------------------===// -def : Proc<"", R600_VLIW5_Itin, - [FeatureR600, FeatureVertexCache]>; - def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize64]>; def : Proc<"r630", R600_VLIW5_Itin, [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; @@ -84,11 +81,11 @@ def : Proc<"cayman", R600_VLIW4_Itin, //===----------------------------------------------------------------------===// def : ProcessorModel<"SI", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] + [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] >; -def : ProcessorModel<"tahiti", SIFullSpeedModel, - [FeatureSouthernIslands, FeatureFastFMAF32] +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureSouthernIslands, FeatureFastFMAF32, HalfRate64Ops] >; def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, [FeatureSouthernIslands]>; @@ -116,8 +113,8 @@ def : ProcessorModel<"kaveri", SIQuarterSpeedModel, >; def : ProcessorModel<"hawaii", SIFullSpeedModel, - [FeatureSeaIslands, FeatureFastFMAF32, FeatureLDSBankCount32, - FeatureISAVersion7_0_1] + [FeatureSeaIslands, FeatureFastFMAF32, HalfRate64Ops, + FeatureLDSBankCount32, FeatureISAVersion7_0_1] >; def : ProcessorModel<"mullins", SIQuarterSpeedModel, @@ -148,3 +145,11 @@ def : ProcessorModel<"fiji", SIQuarterSpeedModel, def : ProcessorModel<"stoney", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount16] >; + +def : ProcessorModel<"polaris10", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] +>; + +def : ProcessorModel<"polaris11", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1, FeatureLDSBankCount32] +>; diff --git a/lib/Target/AMDGPU/R600ClauseMergePass.cpp b/lib/Target/AMDGPU/R600ClauseMergePass.cpp index 3cb90218a7d5..3ccde79e2df4 100644 --- a/lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ b/lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -31,8 +31,8 @@ using namespace llvm; namespace { -static bool isCFAlu(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isCFAlu(const MachineInstr &MI) { + switch (MI.getOpcode()) { case AMDGPU::CF_ALU: case AMDGPU::CF_ALU_PUSH_BEFORE: return true; @@ -47,19 +47,19 @@ private: static char ID; const R600InstrInfo *TII; - unsigned getCFAluSize(const MachineInstr *MI) const; - bool isCFAluEnabled(const MachineInstr *MI) const; + unsigned getCFAluSize(const MachineInstr &MI) const; + bool isCFAluEnabled(const MachineInstr &MI) const; /// IfCvt pass can generate "disabled" ALU clause marker that need to be /// removed and their content affected to the previous alu clause. 
/// This function parse instructions after CFAlu until it find a disabled /// CFAlu and merge the content, or an enabled CFAlu. - void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const; + void cleanPotentialDisabledCFAlu(MachineInstr &CFAlu) const; /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if /// it is the case. - bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu) - const; + bool mergeIfPossible(MachineInstr &RootCFAlu, + const MachineInstr &LatrCFAlu) const; public: R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { } @@ -71,38 +71,40 @@ public: char R600ClauseMergePass::ID = 0; -unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const { +unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm(); + return MI + .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT)) + .getImm(); } -bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const { +bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const { assert(isCFAlu(MI)); - return MI->getOperand( - TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm(); + return MI + .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled)) + .getImm(); } -void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) - const { +void R600ClauseMergePass::cleanPotentialDisabledCFAlu( + MachineInstr &CFAlu) const { int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); - MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end(); + MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end(); I++; do { - while (I!= E && !isCFAlu(I)) + while (I != E && !isCFAlu(*I)) I++; if (I == E) return; - MachineInstr *MI = I++; + MachineInstr &MI = *I++; if (isCFAluEnabled(MI)) break; - CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); - MI->eraseFromParent(); + CFAlu.getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI)); + MI.eraseFromParent(); } while (I != E); } -bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, - const MachineInstr *LatrCFAlu) const { +bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, + const MachineInstr &LatrCFAlu) const { assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); unsigned RootInstCount = getCFAluSize(RootCFAlu), @@ -112,7 +114,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, DEBUG(dbgs() << "Excess inst counts\n"); return false; } - if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) return false; // Is KCache Bank 0 compatible ? 
int Mode0Idx = @@ -121,12 +123,12 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); int KBank0LineIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); - if (LatrCFAlu->getOperand(Mode0Idx).getImm() && - RootCFAlu->getOperand(Mode0Idx).getImm() && - (LatrCFAlu->getOperand(KBank0Idx).getImm() != - RootCFAlu->getOperand(KBank0Idx).getImm() || - LatrCFAlu->getOperand(KBank0LineIdx).getImm() != - RootCFAlu->getOperand(KBank0LineIdx).getImm())) { + if (LatrCFAlu.getOperand(Mode0Idx).getImm() && + RootCFAlu.getOperand(Mode0Idx).getImm() && + (LatrCFAlu.getOperand(KBank0Idx).getImm() != + RootCFAlu.getOperand(KBank0Idx).getImm() || + LatrCFAlu.getOperand(KBank0LineIdx).getImm() != + RootCFAlu.getOperand(KBank0LineIdx).getImm())) { DEBUG(dbgs() << "Wrong KC0\n"); return false; } @@ -137,56 +139,61 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu, TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); int KBank1LineIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); - if (LatrCFAlu->getOperand(Mode1Idx).getImm() && - RootCFAlu->getOperand(Mode1Idx).getImm() && - (LatrCFAlu->getOperand(KBank1Idx).getImm() != - RootCFAlu->getOperand(KBank1Idx).getImm() || - LatrCFAlu->getOperand(KBank1LineIdx).getImm() != - RootCFAlu->getOperand(KBank1LineIdx).getImm())) { + if (LatrCFAlu.getOperand(Mode1Idx).getImm() && + RootCFAlu.getOperand(Mode1Idx).getImm() && + (LatrCFAlu.getOperand(KBank1Idx).getImm() != + RootCFAlu.getOperand(KBank1Idx).getImm() || + LatrCFAlu.getOperand(KBank1LineIdx).getImm() != + RootCFAlu.getOperand(KBank1LineIdx).getImm())) { DEBUG(dbgs() << "Wrong KC0\n"); return false; } - if (LatrCFAlu->getOperand(Mode0Idx).getImm()) { - RootCFAlu->getOperand(Mode0Idx).setImm( - LatrCFAlu->getOperand(Mode0Idx).getImm()); - RootCFAlu->getOperand(KBank0Idx).setImm( - LatrCFAlu->getOperand(KBank0Idx).getImm()); - RootCFAlu->getOperand(KBank0LineIdx).setImm( - LatrCFAlu->getOperand(KBank0LineIdx).getImm()); + if (LatrCFAlu.getOperand(Mode0Idx).getImm()) { + RootCFAlu.getOperand(Mode0Idx).setImm( + LatrCFAlu.getOperand(Mode0Idx).getImm()); + RootCFAlu.getOperand(KBank0Idx).setImm( + LatrCFAlu.getOperand(KBank0Idx).getImm()); + RootCFAlu.getOperand(KBank0LineIdx) + .setImm(LatrCFAlu.getOperand(KBank0LineIdx).getImm()); } - if (LatrCFAlu->getOperand(Mode1Idx).getImm()) { - RootCFAlu->getOperand(Mode1Idx).setImm( - LatrCFAlu->getOperand(Mode1Idx).getImm()); - RootCFAlu->getOperand(KBank1Idx).setImm( - LatrCFAlu->getOperand(KBank1Idx).getImm()); - RootCFAlu->getOperand(KBank1LineIdx).setImm( - LatrCFAlu->getOperand(KBank1LineIdx).getImm()); + if (LatrCFAlu.getOperand(Mode1Idx).getImm()) { + RootCFAlu.getOperand(Mode1Idx).setImm( + LatrCFAlu.getOperand(Mode1Idx).getImm()); + RootCFAlu.getOperand(KBank1Idx).setImm( + LatrCFAlu.getOperand(KBank1Idx).getImm()); + RootCFAlu.getOperand(KBank1LineIdx) + .setImm(LatrCFAlu.getOperand(KBank1LineIdx).getImm()); } - RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts); - RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode())); + RootCFAlu.getOperand(CntIdx).setImm(CumuledInsts); + RootCFAlu.setDesc(TII->get(LatrCFAlu.getOpcode())); return true; } bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + if (skipFunction(*MF.getFunction())) + return false; + + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + TII = 
ST.getInstrInfo(); + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); MachineBasicBlock::iterator LatestCFAlu = E; while (I != E) { - MachineInstr *MI = I++; + MachineInstr &MI = *I++; if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) || - TII->mustBeLastInClause(MI->getOpcode())) + TII->mustBeLastInClause(MI.getOpcode())) LatestCFAlu = E; if (!isCFAlu(MI)) continue; cleanPotentialDisabledCFAlu(MI); - if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) { - MI->eraseFromParent(); + if (LatestCFAlu != E && mergeIfPossible(*LatestCFAlu, MI)) { + MI.eraseFromParent(); } else { - assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled"); + assert(MI.getOperand(8).getImm() && "CF ALU instruction disabled"); LatestCFAlu = MI; } } diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index bd80bb211b4f..d5bda4a8303e 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -39,16 +39,16 @@ struct CFStack { FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 }; - const AMDGPUSubtarget *ST; + const R600Subtarget *ST; std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; unsigned MaxStackSize; unsigned CurrentEntries; unsigned CurrentSubEntries; - CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } unsigned getLoopDepth(); @@ -119,7 +119,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 0; case CFStack::FIRST_NON_WQM_PUSH: assert(!ST->hasCaymanISA()); - if (ST->getGeneration() <= AMDGPUSubtarget::R700) { + if (ST->getGeneration() <= R600Subtarget::R700) { // +1 For the push operation. // +2 Extra space required. return 3; @@ -132,7 +132,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: - assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + assert(ST->getGeneration() >= R600Subtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. 
return 2; @@ -142,8 +142,8 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { } void CFStack::updateMaxStackSize() { - unsigned CurrentStackSize = CurrentEntries + - (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + unsigned CurrentStackSize = + CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4); MaxStackSize = std::max(CurrentStackSize, MaxStackSize); } @@ -159,7 +159,7 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && - ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && + ST->getGeneration() > R600Subtarget::EVERGREEN && !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; @@ -220,10 +220,10 @@ private: const R600InstrInfo *TII; const R600RegisterInfo *TRI; unsigned MaxFetchInst; - const AMDGPUSubtarget *ST; + const R600Subtarget *ST; - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { + bool IsTrivialInst(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::KILL: case AMDGPU::RETURN: return true; @@ -234,7 +234,7 @@ private: const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; - bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); + bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN); switch (CFI) { case CF_TC: Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; @@ -278,11 +278,12 @@ private: return TII->get(Opcode); } - bool isCompatibleWithClause(const MachineInstr *MI, - std::set<unsigned> &DstRegs) const { + bool isCompatibleWithClause(const MachineInstr &MI, + std::set<unsigned> &DstRegs) const { unsigned DstMI, SrcMI; - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { + for (MachineInstr::const_mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); + I != E; ++I) { const MachineOperand &MO = *I; if (!MO.isReg()) continue; @@ -318,20 +319,20 @@ private: MachineBasicBlock::iterator ClauseHead = I; std::vector<MachineInstr *> ClauseContent; unsigned AluInstCount = 0; - bool IsTex = TII->usesTextureCache(ClauseHead); + bool IsTex = TII->usesTextureCache(*ClauseHead); std::set<unsigned> DstRegs; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) + if (IsTrivialInst(*I)) continue; if (AluInstCount >= MaxFetchInst) break; - if ((IsTex && !TII->usesTextureCache(I)) || - (!IsTex && !TII->usesVertexCache(I))) + if ((IsTex && !TII->usesTextureCache(*I)) || + (!IsTex && !TII->usesVertexCache(*I))) break; - if (!isCompatibleWithClause(I, DstRegs)) + if (!isCompatibleWithClause(*I, DstRegs)) break; AluInstCount ++; - ClauseContent.push_back(I); + ClauseContent.push_back(&*I); } MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), getHWInstrDesc(IsTex?CF_TC:CF_VC)) @@ -340,28 +341,37 @@ private: return ClauseFile(MIb, std::move(ClauseContent)); } - void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { + void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const { static const unsigned LiteralRegs[] = { AMDGPU::ALU_LITERAL_X, AMDGPU::ALU_LITERAL_Y, AMDGPU::ALU_LITERAL_Z, AMDGPU::ALU_LITERAL_W }; - const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs = + const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = TII->getSrcs(MI); - for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { - if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) + for 
(const auto &Src:Srcs) { + if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) continue; - int64_t Imm = Srcs[i].second; - std::vector<int64_t>::iterator It = - std::find(Lits.begin(), Lits.end(), Imm); + int64_t Imm = Src.second; + std::vector<MachineOperand*>::iterator It = + std::find_if(Lits.begin(), Lits.end(), + [&](MachineOperand* val) + { return val->isImm() && (val->getImm() == Imm);}); + + // Get corresponding Operand + MachineOperand &Operand = MI.getOperand( + TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + if (It != Lits.end()) { + // Reuse existing literal reg unsigned Index = It - Lits.begin(); - Srcs[i].first->setReg(LiteralRegs[Index]); + Src.first->setReg(LiteralRegs[Index]); } else { + // Allocate new literal reg assert(Lits.size() < 4 && "Too many literals in Instruction Group"); - Srcs[i].first->setReg(LiteralRegs[Lits.size()]); - Lits.push_back(Imm); + Src.first->setReg(LiteralRegs[Lits.size()]); + Lits.push_back(&Operand); } } } @@ -384,56 +394,66 @@ private: ClauseFile MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) const { - MachineBasicBlock::iterator ClauseHead = I; + MachineInstr &ClauseHead = *I; std::vector<MachineInstr *> ClauseContent; I++; for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { - if (IsTrivialInst(I)) { + if (IsTrivialInst(*I)) { ++I; continue; } if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) break; - std::vector<int64_t> Literals; + std::vector<MachineOperand *>Literals; if (I->isBundle()) { - MachineInstr *DeleteMI = I; + MachineInstr &DeleteMI = *I; MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); while (++BI != E && BI->isBundledWithPred()) { BI->unbundleFromPred(); - for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = BI->getOperand(i); + for (MachineOperand &MO : BI->operands()) { if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(&*BI, Literals); + getLiteral(*BI, Literals); ClauseContent.push_back(&*BI); } I = BI; - DeleteMI->eraseFromParent(); + DeleteMI.eraseFromParent(); } else { - getLiteral(I, Literals); - ClauseContent.push_back(I); + getLiteral(*I, Literals); + ClauseContent.push_back(&*I); I++; } - for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { - unsigned literal0 = Literals[i]; - unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; - MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) - .addImm(literal0) - .addImm(literal2); + for (unsigned i = 0, e = Literals.size(); i < e; i += 2) { + MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(), + TII->get(AMDGPU::LITERALS)); + if (Literals[i]->isImm()) { + MILit.addImm(Literals[i]->getImm()); + } else { + MILit.addGlobalAddress(Literals[i]->getGlobal(), + Literals[i]->getOffset()); + } + if (i + 1 < e) { + if (Literals[i + 1]->isImm()) { + MILit.addImm(Literals[i + 1]->getImm()); + } else { + MILit.addGlobalAddress(Literals[i + 1]->getGlobal(), + Literals[i + 1]->getOffset()); + } + } else + MILit.addImm(0); ClauseContent.push_back(MILit); } } assert(ClauseContent.size() < 128 && "ALU clause is too big"); - ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); - return ClauseFile(ClauseHead, std::move(ClauseContent)); + ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1); + return ClauseFile(&ClauseHead, std::move(ClauseContent)); } void EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { - CounterPropagateAddr(Clause.first, CfCount); + 
CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) .addImm(CfCount); @@ -447,7 +467,7 @@ private: EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { Clause.first->getOperand(0).setImm(0); - CounterPropagateAddr(Clause.first, CfCount); + CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) .addImm(CfCount); @@ -457,13 +477,13 @@ private: CfCount += Clause.second.size(); } - void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { - MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); + void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const { + MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm()); } void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, unsigned Addr) const { for (MachineInstr *MI : MIs) { - CounterPropagateAddr(MI, Addr); + CounterPropagateAddr(*MI, Addr); } } @@ -472,20 +492,21 @@ public: : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override { - ST = &MF.getSubtarget<AMDGPUSubtarget>(); + ST = &MF.getSubtarget<R600Subtarget>(); MaxFetchInst = ST->getTexVTXClauseSize(); - TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); - TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); + TII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - CFStack CFStack(ST, MFI->getShaderType()); + CFStack CFStack(ST, MF.getFunction()->getCallingConv()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; std::vector<MachineInstr * > IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; @@ -493,10 +514,10 @@ public: std::vector<ClauseFile> FetchClauses, AluClauses; std::vector<MachineInstr *> LastAlu(1); std::vector<MachineInstr *> ToPopAfter; - + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { + if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) { DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; @@ -508,7 +529,7 @@ public: if (MI->getOpcode() != AMDGPU::ENDIF) LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) - LastAlu.back() = MI; + LastAlu.back() = &*MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); @@ -571,7 +592,7 @@ public: case AMDGPU::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); - CounterPropagateAddr(JumpInst, CfCount); + CounterPropagateAddr(*JumpInst, CfCount); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_ELSE)) .addImm(0) @@ -595,10 +616,10 @@ public: DEBUG(dbgs() << CfCount << ":"; MIb->dump();); CfCount++; } - + MachineInstr *IfOrElseInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); - CounterPropagateAddr(IfOrElseInst, CfCount); + CounterPropagateAddr(*IfOrElseInst, CfCount); 
IfOrElseInst->getOperand(1).setImm(1); LastAlu.pop_back(); MI->eraseFromParent(); @@ -625,15 +646,16 @@ public: case AMDGPU::RETURN: { BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); CfCount++; - MI->eraseFromParent(); if (CfCount % 2) { BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); CfCount++; } + MI->eraseFromParent(); for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) EmitFetchClause(I, FetchClauses[i], CfCount); for (unsigned i = 0, e = AluClauses.size(); i < e; i++) EmitALUClause(I, AluClauses[i], CfCount); + break; } default: if (TII->isExport(MI->getOpcode())) { diff --git a/lib/Target/AMDGPU/R600Defines.h b/lib/Target/AMDGPU/R600Defines.h index 51d87eda31d1..534461adc59f 100644 --- a/lib/Target/AMDGPU/R600Defines.h +++ b/lib/Target/AMDGPU/R600Defines.h @@ -8,8 +8,8 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600DEFINES_H -#define LLVM_LIB_TARGET_R600_R600DEFINES_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H +#define LLVM_LIB_TARGET_AMDGPU_R600DEFINES_H #include "llvm/MC/MCRegisterInfo.h" diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index fdc20302f4a3..93ed5be94a54 100644 --- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -38,8 +38,8 @@ private: const R600InstrInfo *TII; int Address; - unsigned OccupiedDwords(MachineInstr *MI) const { - switch (MI->getOpcode()) { + unsigned OccupiedDwords(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: @@ -53,17 +53,17 @@ private: // These will be expanded to two ALU instructions in the // ExpandSpecialInstructions pass. 
- if (TII->isLDSRetInstr(MI->getOpcode())) + if (TII->isLDSRetInstr(MI.getOpcode())) return 2; - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode())) + if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode()) || + TII->isReductionOp(MI.getOpcode())) return 4; unsigned NumLiteral = 0; - for (MachineInstr::mop_iterator It = MI->operands_begin(), - E = MI->operands_end(); It != E; ++It) { + for (MachineInstr::mop_iterator It = MI.operands_begin(), + E = MI.operands_end(); + It != E; ++It) { MachineOperand &MO = *It; if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) ++NumLiteral; @@ -71,12 +71,12 @@ private: return 1 + NumLiteral; } - bool isALU(const MachineInstr *MI) const { - if (TII->isALUInstr(MI->getOpcode())) + bool isALU(const MachineInstr &MI) const { + if (TII->isALUInstr(MI.getOpcode())) return true; - if (TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode())) + if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode())) return true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::PRED_X: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: @@ -89,8 +89,8 @@ private: } } - bool IsTrivialInst(MachineInstr *MI) const { - switch (MI->getOpcode()) { + bool IsTrivialInst(MachineInstr &MI) const { + switch (MI.getOpcode()) { case AMDGPU::KILL: case AMDGPU::RETURN: case AMDGPU::IMPLICIT_DEF: @@ -114,18 +114,20 @@ private: ((((Sel >> 2) - 512) & 4095) >> 5) << 1); } - bool SubstituteKCacheBank(MachineInstr *MI, - std::vector<std::pair<unsigned, unsigned> > &CachedConsts, - bool UpdateInstr = true) const { + bool + SubstituteKCacheBank(MachineInstr &MI, + std::vector<std::pair<unsigned, unsigned>> &CachedConsts, + bool UpdateInstr = true) const { std::vector<std::pair<unsigned, unsigned> > UsedKCache; - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) return true; - const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = + const SmallVectorImpl<std::pair<MachineOperand *, int64_t>> &Consts = TII->getSrcs(MI); - assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + assert( + (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) && + "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) continue; @@ -194,9 +196,9 @@ private: // in the clause. unsigned LastUseCount = 0; for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) { - AluInstCount += OccupiedDwords(UseI); + AluInstCount += OccupiedDwords(*UseI); // Make sure we won't need to end the clause due to KCache limitations. - if (!SubstituteKCacheBank(UseI, KCacheBanks, false)) + if (!SubstituteKCacheBank(*UseI, KCacheBanks, false)) return false; // We have reached the maximum instruction limit before finding the @@ -230,9 +232,9 @@ private: bool PushBeforeModifier = false; unsigned AluInstCount = 0; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { - if (IsTrivialInst(I)) + if (IsTrivialInst(*I)) continue; - if (!isALU(I)) + if (!isALU(*I)) break; if (AluInstCount > TII->getMaxAlusPerClause()) break; @@ -245,7 +247,7 @@ private: // clause as predicated alus). 
if (AluInstCount > 0) break; - if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH) + if (TII->getFlagOp(*I).getImm() & MO_FLAG_PUSH) PushBeforeModifier = true; AluInstCount ++; continue; @@ -267,16 +269,16 @@ private: if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E)) break; - if (!SubstituteKCacheBank(I, KCacheBanks)) + if (!SubstituteKCacheBank(*I, KCacheBanks)) break; - AluInstCount += OccupiedDwords(I); + AluInstCount += OccupiedDwords(*I); } unsigned Opcode = PushBeforeModifier ? AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) // We don't use the ADDR field until R600ControlFlowFinalizer pass, where // it is safe to assume it is 0. However if we always put 0 here, the ifcvt - // pass may assume that identical ALU clause starter at the beginning of a + // pass may assume that identical ALU clause starter at the beginning of a // true and false branch can be factorized which is not the case. .addImm(Address++) // ADDR .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 @@ -298,7 +300,8 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + TII = ST.getInstrInfo(); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { @@ -307,7 +310,7 @@ public: if (I->getOpcode() == AMDGPU::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { - if (isALU(I)) + if (isALU(*I)) I = MakeALUClause(MBB, I); else ++I; diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp index 211d392e8fcc..0385b6283f37 100644 --- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -29,7 +29,6 @@ using namespace llvm; namespace { class R600ExpandSpecialInstrsPass : public MachineFunctionPass { - private: static char ID; const R600InstrInfo *TII; @@ -61,12 +60,13 @@ void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, int OpIdx = TII->getOperandIdx(*OldMI, Op); if (OpIdx > -1) { uint64_t Val = OldMI->getOperand(OpIdx).getImm(); - TII->setImmOperand(NewMI, Op, Val); + TII->setImmOperand(*NewMI, Op, Val); } } bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + TII = ST.getInstrInfo(); const R600RegisterInfo &TRI = TII->getRegisterInfo(); @@ -107,11 +107,11 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 AMDGPU::ZERO); // src1 - TII->addFlag(PredSet, 0, MO_FLAG_MASK); + TII->addFlag(*PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); + TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1); } else { - TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); + TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1); } MI.eraseFromParent(); continue; @@ -137,9 +137,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan >= 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 
0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -166,9 +166,9 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan < 2) - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -189,7 +189,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); } MI.eraseFromParent(); @@ -212,10 +212,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { BMI->bundleWithPred(); } if (Mask) { - TII->addFlag(BMI, 0, MO_FLAG_MASK); + TII->addFlag(*BMI, 0, MO_FLAG_MASK); } if (Chan != 3) - TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*BMI, 0, MO_FLAG_NOT_LAST); unsigned Opcode = BMI->getOpcode(); // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. @@ -330,10 +330,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { if (Chan != 0) NewMI->bundleWithPred(); if (Mask) { - TII->addFlag(NewMI, 0, MO_FLAG_MASK); + TII->addFlag(*NewMI, 0, MO_FLAG_MASK); } if (NotLast) { - TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST); } SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp new file mode 100644 index 000000000000..dd5681ff5e8b --- /dev/null +++ b/lib/Target/AMDGPU/R600FrameLowering.cpp @@ -0,0 +1,15 @@ +//===----------------------- R600FrameLowering.cpp ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +#include "R600FrameLowering.h" + +using namespace llvm; + +R600FrameLowering::~R600FrameLowering() { +} diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h new file mode 100644 index 000000000000..5fe4e0d201ac --- /dev/null +++ b/lib/Target/AMDGPU/R600FrameLowering.h @@ -0,0 +1,30 @@ +//===--------------------- R600FrameLowering.h ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_R600FRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class R600FrameLowering : public AMDGPUFrameLowering { +public: + R600FrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + virtual ~R600FrameLowering(); + + void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {} + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {} +}; + +} + +#endif diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 124a9c6e0f56..8f78edd76a51 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -30,18 +30,61 @@ using namespace llvm; -R600TargetLowering::R600TargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) +R600TargetLowering::R600TargetLowering(const TargetMachine &TM, + const R600Subtarget &STI) : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); computeRegisterProperties(STI.getRegisterInfo()); + // Legalize loads and stores to the private address space. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); + setOperationAction(ISD::LOAD, MVT::v4i32, Custom); + + // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address + // spaces, so it is custom lowered to handle those where it isn't. + for (MVT VT : MVT::integer_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); + } + + // Workaround for LegalizeDAG asserting on expansion of i1 vector loads. + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand); + + + setOperationAction(ISD::STORE, MVT::i8, Custom); + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTruncStoreAction(MVT::i32, MVT::i8, Custom); + setTruncStoreAction(MVT::i32, MVT::i16, Custom); + + // Workaround for LegalizeDAG asserting on expansion of i1 vector stores. 
+ setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand); + setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand); + // Set condition code actions setCondCodeAction(ISD::SETO, MVT::f32, Expand); setCondCodeAction(ISD::SETUO, MVT::f32, Expand); @@ -73,10 +116,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::f32, Expand); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -122,37 +161,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); - - // Legalize loads and stores to the private address space. - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v2i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); - - // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address - // spaces, so it is custom lowered to handle those where it isn't. - for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom); - } - - setOperationAction(ISD::STORE, MVT::i8, Custom); - setOperationAction(ISD::STORE, MVT::i32, Custom); - setOperationAction(ISD::STORE, MVT::v2i32, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); - setTruncStoreAction(MVT::i32, MVT::i8, Custom); - setTruncStoreAction(MVT::i32, MVT::i16, Custom); - - setOperationAction(ISD::LOAD, MVT::i32, Custom); - setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); @@ -165,12 +173,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); - setTargetDAGCombine(ISD::FP_ROUND); - setTargetDAGCombine(ISD::FP_TO_SINT); - setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); - setTargetDAGCombine(ISD::SELECT_CC); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32 // to be Legal/Custom in order to avoid library calls. 
setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -188,119 +190,138 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, } setSchedulingPreference(Sched::Source); + + + setTargetDAGCombine(ISD::FP_ROUND); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + setTargetDAGCombine(ISD::SELECT_CC); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); +} + +const R600Subtarget *R600TargetLowering::getSubtarget() const { + return static_cast<const R600Subtarget *>(Subtarget); } static inline bool isEOP(MachineBasicBlock::iterator I) { return std::next(I)->getOpcode() == AMDGPU::RETURN; } -MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { +MachineBasicBlock * +R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const { MachineFunction * MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - MachineBasicBlock::iterator I = *MI; - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); + MachineBasicBlock::iterator I = MI; + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. - if (TII->isLDSRetInstr(MI->getOpcode())) { - int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + if (TII->isLDSRetInstr(MI.getOpcode())) { + int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. - if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) || - MI->getOpcode() == AMDGPU::LDS_CMPST_RET) + if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || + MI.getOpcode() == AMDGPU::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode()))); - for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) { - NewMI.addOperand(MI->getOperand(i)); + TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { + NewMI.addOperand(MI.getOperand(i)); } } else { return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; case AMDGPU::CLAMP_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); break; } case AMDGPU::FABS_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_ABS); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } case AMDGPU::FNEG_R600: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, - AMDGPU::MOV, - MI->getOperand(0).getReg(), - MI->getOperand(1).getReg()); - TII->addFlag(NewMI, 0, MO_FLAG_NEG); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + MI.getOperand(1).getReg()); + TII->addFlag(*NewMI, 0, 
MO_FLAG_NEG); break; } case AMDGPU::MASK_WRITE: { - unsigned maskedRegister = MI->getOperand(0).getReg(); + unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); - TII->addFlag(defInstr, 0, MO_FLAG_MASK); + TII->addFlag(*defInstr, 0, MO_FLAG_MASK); break; } case AMDGPU::MOV_IMM_F32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getFPImm()->getValueAPF() - .bitcastToAPInt().getZExtValue()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) + .getFPImm() + ->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); break; case AMDGPU::MOV_IMM_I32: - TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), - MI->getOperand(1).getImm()); + TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), + MI.getOperand(1).getImm()); break; + case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + //TODO: Perhaps combine this instruction with the next if possible + auto MIB = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + //TODO: Ugh this is rather ugly + MIB->getOperand(Idx) = MI.getOperand(1); + break; + } case AMDGPU::CONST_COPY: { - MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, - MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel, - MI->getOperand(1).getImm()); + MachineInstr *NewMI = TII->buildDefaultInstruction( + *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); + TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + MI.getOperand(1).getImm()); break; } case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::RAT_STORE_TYPED_eg: { - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addImm(isEOP(I)); // Set End of program bit + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } case AMDGPU::TXD: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -333,75 +354,77 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( CTZ = 0; break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - 
.addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) + .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::TXD_SHADOW: { unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); - MachineOperand &RID = MI->getOperand(4); - MachineOperand &SID = MI->getOperand(5); - unsigned TextureId = MI->getOperand(6).getImm(); + MachineOperand &RID = MI.getOperand(4); + MachineOperand &SID = MI.getOperand(5); + unsigned TextureId = MI.getOperand(6).getImm(); unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3; unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1; @@ -435,99 +458,101 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( break; } - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) - .addOperand(MI->getOperand(3)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) - .addOperand(MI->getOperand(2)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), + T0) 
+ .addOperand(MI.getOperand(3)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), + T1) + .addOperand(MI.getOperand(2)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addImm(SrcX) - .addImm(SrcY) - .addImm(SrcZ) - .addImm(SrcW) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(0) - .addImm(1) - .addImm(2) - .addImm(3) - .addOperand(RID) - .addOperand(SID) - .addImm(CTX) - .addImm(CTY) - .addImm(CTZ) - .addImm(CTW) - .addReg(T0, RegState::Implicit) - .addReg(T1, RegState::Implicit); + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addImm(SrcX) + .addImm(SrcY) + .addImm(SrcZ) + .addImm(SrcW) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(0) + .addImm(1) + .addImm(2) + .addImm(3) + .addOperand(RID) + .addOperand(SID) + .addImm(CTX) + .addImm(CTY) + .addImm(CTZ) + .addImm(CTW) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); break; } case AMDGPU::BRANCH: - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) - .addOperand(MI->getOperand(0)); - break; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI.getOperand(0)); + break; case AMDGPU::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) - .addImm(OPCODE_IS_NOT_ZERO) - .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } case AMDGPU::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) - .addOperand(MI->getOperand(1)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI.getOperand(1)) .addImm(OPCODE_IS_NOT_ZERO_INT) .addImm(0); // Flags - TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) - .addOperand(MI->getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addOperand(MI.getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); break; } @@ -535,7 +560,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; - unsigned InstExportType = MI->getOperand(1).getImm(); + unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock 
= BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { @@ -552,17 +577,17 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) - .addOperand(MI->getOperand(0)) - .addOperand(MI->getOperand(1)) - .addOperand(MI->getOperand(2)) - .addOperand(MI->getOperand(3)) - .addOperand(MI->getOperand(4)) - .addOperand(MI->getOperand(5)) - .addOperand(MI->getOperand(6)) - .addImm(CfInst) - .addImm(EOP); + unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40; + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addOperand(MI.getOperand(3)) + .addOperand(MI.getOperand(4)) + .addOperand(MI.getOperand(5)) + .addOperand(MI.getOperand(6)) + .addImm(CfInst) + .addImm(EOP); break; } case AMDGPU::RETURN: { @@ -576,7 +601,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } - MI->eraseFromParent(); + MI.eraseFromParent(); return BB; } @@ -610,18 +635,13 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); + case ISD::FrameIndex: return lowerFrameIndex(Op, DAG); case ISD::INTRINSIC_VOID: { SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::AMDGPU_store_output: { - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MFI->LiveOuts.push_back(Reg); - return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2)); - } - case AMDGPUIntrinsic::R600_store_swizzle: { + case AMDGPUIntrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -649,114 +669,48 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const SDLoc DL(Op); switch(IntrinsicID) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case AMDGPUIntrinsic::R600_load_input: { - int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MRI.addLiveIn(Reg); - return DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), Reg, VT); - } - - case AMDGPUIntrinsic::R600_interp_input: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); - MachineSDNode *interp; - if (ijb < 0) { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo()); - interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, - MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32)); - return DAG.getTargetExtractSubreg( - TII->getRegisterInfo().getSubRegFromChannel(slot % 4), - DL, MVT::f32, SDValue(interp, 0)); - } - MachineFunction &MF = DAG.getMachineFunction(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb); - unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1); - 
MRI.addLiveIn(RegisterI); - MRI.addLiveIn(RegisterJ); - SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32); - SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(), - SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32); - - if (slot % 4 < 2) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32), - RegisterJNode, RegisterINode); - return SDValue(interp, slot % 2); - } - case AMDGPUIntrinsic::R600_interp_xy: - case AMDGPUIntrinsic::R600_interp_zw: { - int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - MachineSDNode *interp; - SDValue RegisterINode = Op.getOperand(2); - SDValue RegisterJNode = Op.getOperand(3); - - if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy) - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - else - interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, - MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32), - RegisterJNode, RegisterINode); - return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, - SDValue(interp, 0), SDValue(interp, 1)); - } - case AMDGPUIntrinsic::R600_tex: - case AMDGPUIntrinsic::R600_texc: - case AMDGPUIntrinsic::R600_txl: - case AMDGPUIntrinsic::R600_txlc: - case AMDGPUIntrinsic::R600_txb: - case AMDGPUIntrinsic::R600_txbc: - case AMDGPUIntrinsic::R600_txf: - case AMDGPUIntrinsic::R600_txq: - case AMDGPUIntrinsic::R600_ddx: - case AMDGPUIntrinsic::R600_ddy: - case AMDGPUIntrinsic::R600_ldptr: { + case AMDGPUIntrinsic::r600_tex: + case AMDGPUIntrinsic::r600_texc: + case AMDGPUIntrinsic::r600_txl: + case AMDGPUIntrinsic::r600_txlc: + case AMDGPUIntrinsic::r600_txb: + case AMDGPUIntrinsic::r600_txbc: + case AMDGPUIntrinsic::r600_txf: + case AMDGPUIntrinsic::r600_txq: + case AMDGPUIntrinsic::r600_ddx: + case AMDGPUIntrinsic::r600_ddy: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::R600_tex: + case AMDGPUIntrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::R600_texc: + case AMDGPUIntrinsic::r600_texc: TextureOp = 1; break; - case AMDGPUIntrinsic::R600_txl: + case AMDGPUIntrinsic::r600_txl: TextureOp = 2; break; - case AMDGPUIntrinsic::R600_txlc: + case AMDGPUIntrinsic::r600_txlc: TextureOp = 3; break; - case AMDGPUIntrinsic::R600_txb: + case AMDGPUIntrinsic::r600_txb: TextureOp = 4; break; - case AMDGPUIntrinsic::R600_txbc: + case AMDGPUIntrinsic::r600_txbc: TextureOp = 5; break; - case AMDGPUIntrinsic::R600_txf: + case AMDGPUIntrinsic::r600_txf: TextureOp = 6; break; - case AMDGPUIntrinsic::R600_txq: + case AMDGPUIntrinsic::r600_txq: TextureOp = 7; break; - case AMDGPUIntrinsic::R600_ddx: + case AMDGPUIntrinsic::r600_ddx: TextureOp = 8; break; - case AMDGPUIntrinsic::R600_ddy: + case AMDGPUIntrinsic::r600_ddy: TextureOp = 9; break; - case AMDGPUIntrinsic::R600_ldptr: - TextureOp = 10; - break; default: llvm_unreachable("Unknow Texture Operation"); } @@ -784,7 +738,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::AMDGPU_dp4: { + case AMDGPUIntrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), 
DAG.getConstant(0, DL, MVT::i32)), @@ -806,6 +760,11 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args); } + case Intrinsic::r600_implicitarg_ptr: { + MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS); + uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return DAG.getConstant(ByteOffset, DL, PtrVT); + } case Intrinsic::r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); case Intrinsic::r600_read_ngroups_y: @@ -825,7 +784,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_local_size_z: return LowerImplicitParameter(DAG, VT, DL, 8); - case Intrinsic::AMDGPU_read_workdim: { + case Intrinsic::r600_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name. uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM); return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4); } @@ -848,14 +808,14 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); - case Intrinsic::AMDGPU_rsq: - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. - return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case Intrinsic::r600_recipsqrt_ieee: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + + case Intrinsic::r600_recipsqrt_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; } @@ -950,6 +910,22 @@ SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, return vectorToVerticalVector(DAG, Insert); } +SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, + SDValue Op, + SelectionDAG &DAG) const { + + GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); + + const DataLayout &DL = DAG.getDataLayout(); + const GlobalValue *GV = GSD->getGlobal(); + MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + + SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA); +} + SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) @@ -977,7 +953,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDValue TrigVal = DAG.getNode(TrigNode, DL, VT, DAG.getNode(ISD::FADD, DL, VT, FractPart, DAG.getConstantFP(-0.5, DL, MVT::f32))); - if (Gen >= AMDGPUSubtarget::R700) + if (Gen >= R600Subtarget::R700) return TrigVal; // On R600 hw, COS/SIN input must be between -Pi and Pi. 
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal, @@ -1088,7 +1064,7 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { } SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, + const SDLoc &DL, unsigned DwordOffset) const { unsigned ByteOffset = DwordOffset * 4; PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), @@ -1099,8 +1075,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR - MachinePointerInfo(ConstantPointerNull::get(PtrType)), - false, false, false, 0); + MachinePointerInfo(ConstantPointerNull::get(PtrType))); } bool R600TargetLowering::isZero(SDValue Op) const { @@ -1113,6 +1088,20 @@ bool R600TargetLowering::isZero(SDValue Op) const { } } +bool R600TargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + return isAllOnesConstant(Op); +} + +bool R600TargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + return isNullConstant(Op); +} + SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); @@ -1311,19 +1300,73 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth, } } +SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store, + SelectionDAG &DAG) const { + SDLoc DL(Store); + + unsigned Mask = 0; + if (Store->getMemoryVT() == MVT::i8) { + Mask = 0xff; + } else if (Store->getMemoryVT() == MVT::i16) { + Mask = 0xffff; + } + + SDValue Chain = Store->getChain(); + SDValue BasePtr = Store->getBasePtr(); + EVT MemVT = Store->getMemoryVT(); + + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr, + DAG.getConstant(2, DL, MVT::i32)); + SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32, + Chain, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); + + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr, + DAG.getConstant(0x3, DL, MVT::i32)); + + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, + Store->getValue()); + + SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT); + + SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32, + MaskedValue, ShiftAmt); + + SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(Mask, DL, MVT::i32), + ShiftAmt); + DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask, + DAG.getConstant(0xffffffff, DL, MVT::i32)); + Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask); + + SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue); + return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, + Chain, Value, Ptr, + DAG.getTargetConstant(0, DL, MVT::i32)); +} + SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); + if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG)) + return Result; + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); - SDValue Chain = Op.getOperand(0); - SDValue Value = Op.getOperand(1); - SDValue Ptr = Op.getOperand(2); + unsigned AS = StoreNode->getAddressSpace(); + SDValue Value = StoreNode->getValue(); + EVT ValueVT = Value.getValueType(); - SDValue Result = 
AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Result.getNode()) { - return Result; + if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) && + ValueVT.isVector()) { + return SplitVectorStore(Op, DAG); } - if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) { + SDLoc DL(Op); + SDValue Chain = StoreNode->getChain(); + SDValue Ptr = StoreNode->getBasePtr(); + + if (AS == AMDGPUAS::GLOBAL_ADDRESS) { if (StoreNode->isTruncatingStore()) { EVT VT = Value.getValueType(); assert(VT.bitsLE(MVT::i32)); @@ -1352,13 +1395,13 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(0, DL, MVT::i32), Mask }; - SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src); + SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src); SDValue Args[3] = { Chain, Input, DWordAddr }; return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL, Op->getVTList(), Args, MemVT, StoreNode->getMemOperand()); } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && - Value.getValueType().bitsGE(MVT::i32)) { + ValueVT.bitsGE(MVT::i32)) { // Convert pointer from byte address to dword address. Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), @@ -1373,21 +1416,16 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { } } - EVT ValueVT = Value.getValueType(); - - if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + if (AS != AMDGPUAS::PRIVATE_ADDRESS) return SDValue(); - } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) { - return Ret; - } - // Lowering for indirect addressing + EVT MemVT = StoreNode->getMemoryVT(); + if (MemVT.bitsLT(MVT::i32)) + return lowerPrivateTruncStore(StoreNode, DAG); + // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1465,37 +1503,81 @@ ConstantAddressBlock(unsigned AddressSpace) { } } -SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const -{ - EVT VT = Op.getValueType(); +SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op, + SelectionDAG &DAG) const { SDLoc DL(Op); - LoadSDNode *LoadNode = cast<LoadSDNode>(Op); - SDValue Chain = Op.getOperand(0); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; + LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); + + // <SI && AS=PRIVATE && EXTLOAD && size < 32bit, + // register (2-)byte extract. + + // Get Register holding the target. + SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(), + DAG.getConstant(2, DL, MVT::i32)); + // Load the Register. + SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), + Load->getChain(), + Ptr, + DAG.getTargetConstant(0, DL, MVT::i32), + Op.getOperand(2)); + + // Get offset within the register. + SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, + Load->getBasePtr(), + DAG.getConstant(0x3, DL, MVT::i32)); + + // Bit offset of target byte (byteIdx * 8). + SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx, + DAG.getConstant(3, DL, MVT::i32)); + + // Shift to the right. 
+ Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt); + + // Eliminate the upper bits by setting them to ... + EVT MemEltVT = MemVT.getScalarType(); + + // ... ones. + if (ExtType == ISD::SEXTLOAD) { + SDValue MemEltVTNode = DAG.getValueType(MemEltVT); + + SDValue Ops[] = { + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode), + Load->getChain() + }; - if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG)) - return Ret; + return DAG.getMergeValues(Ops, DL); + } + + // ... or zeros. + SDValue Ops[] = { + DAG.getZeroExtendInReg(Ret, DL, MemEltVT), + Load->getChain() + }; - // Lower loads constant address space global variable loads - if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && - isa<GlobalVariable>(GetUnderlyingObject( - LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) { + return DAG.getMergeValues(Ops, DL); +} - SDValue Ptr = DAG.getZExtOrTrunc( - LoadNode->getBasePtr(), DL, - getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS)); - Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, - DAG.getConstant(2, DL, MVT::i32)); - return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(), - LoadNode->getChain(), Ptr, - DAG.getTargetConstant(0, DL, MVT::i32), - Op.getOperand(2)); +SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LoadNode = cast<LoadSDNode>(Op); + unsigned AS = LoadNode->getAddressSpace(); + EVT MemVT = LoadNode->getMemoryVT(); + ISD::LoadExtType ExtType = LoadNode->getExtensionType(); + + if (AS == AMDGPUAS::PRIVATE_ADDRESS && + ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) { + return lowerPrivateExtLoad(Op, DAG); } + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue Chain = LoadNode->getChain(); + SDValue Ptr = LoadNode->getBasePtr(); + if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) { SDValue MergedValues[2] = { - ScalarizeVectorLoad(Op, DAG), + scalarizeVectorLoad(LoadNode, DAG), Chain }; return DAG.getMergeValues(MergedValues, DL); @@ -1526,8 +1608,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const NewVT = VT; NumElements = VT.getVectorNumElements(); } - Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, - makeArrayRef(Slots, NumElements)); + Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements)); } else { // non-constant ptr can't be folded, keeps it as a v4f32 load Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32, @@ -1550,6 +1631,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const return DAG.getMergeValues(MergedValues, DL); } + SDValue LoweredLoad; + // For most operations returning SDValue() will result in the node being // expanded by the DAG Legalizer. 
This is not the case for ISD::LOAD, so we // need to manually expand loads that may be legal in some address spaces and @@ -1560,12 +1643,9 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const if (LoadNode->getExtensionType() == ISD::SEXTLOAD) { EVT MemVT = LoadNode->getMemoryVT(); assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8)); - SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr, - LoadNode->getPointerInfo(), MemVT, - LoadNode->isVolatile(), - LoadNode->isNonTemporal(), - LoadNode->isInvariant(), - LoadNode->getAlignment()); + SDValue NewLoad = DAG.getExtLoad( + ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT, + LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags()); SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad, DAG.getValueType(MemVT)); @@ -1579,8 +1659,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const // Lowering for indirect addressing const MachineFunction &MF = DAG.getMachineFunction(); - const AMDGPUFrameLowering *TFL = - static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering()); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); unsigned StackWidth = TFL->getStackWidth(MF); Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); @@ -1590,6 +1669,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const EVT ElemVT = VT.getVectorElementType(); SDValue Loads[4]; + assert(NumElemVT <= 4); assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " "vector width in load"); @@ -1603,11 +1683,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const DAG.getTargetConstant(Channel, DL, MVT::i32), Op.getOperand(2)); } - for (unsigned i = NumElemVT; i < 4; ++i) { - Loads[i] = DAG.getUNDEF(ElemVT); - } - EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4); - LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads); + EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT); + LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT)); } else { LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT, Chain, Ptr, @@ -1632,16 +1709,28 @@ SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Chain, Jump, Cond); } +SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + + FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); + + unsigned FrameIndex = FIN->getIndex(); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); + return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), + Op.getValueType()); +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. 
SDValue R600TargetLowering::LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -1664,7 +1753,7 @@ SDValue R600TargetLowering::LowerFormalArguments( MemVT = MemVT.getVectorElementType(); } - if (MFI->getShaderType() != ShaderType::COMPUTE) { + if (AMDGPU::isShader(CallConv)) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); @@ -1699,11 +1788,11 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned Offset = 36 + VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); - SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), - DAG.getUNDEF(MVT::i32), - PtrInfo, - MemVT, false, true, true, 4); + SDValue Arg = DAG.getLoad( + ISD::UNINDEXED, Ext, VT, DL, Chain, + DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + MemVT, /* Alignment = */ 4, + MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant); // 4 is the preferred alignment for the CONSTANT memory space. InVals.push_back(Arg); @@ -1719,6 +1808,26 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + if (!VT.isSimple() || VT == MVT::Other) + return false; + + if (VT.bitsLT(MVT::i32)) + return false; + + // TODO: This is a rough estimate. + if (IsFast) + *IsFast = true; + + return VT.bitsGT(MVT::i32) && Align % 4 == 0; +} + static SDValue CompactSwizzlableVector( SelectionDAG &DAG, SDValue VectorEntry, DenseMap<unsigned, unsigned> &RemapSwizzle) { @@ -1732,7 +1841,7 @@ static SDValue CompactSwizzlableVector( }; for (unsigned i = 0; i < 4; i++) { - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) // We mask write here to teach later passes that the ith element of this // vector is undef. Thus we can use it to reduce 128 bits reg usage, // break false dependencies and additionnaly make assembly easier to read. 
@@ -1747,7 +1856,7 @@ static SDValue CompactSwizzlableVector( } } - if (NewBldVec[i].getOpcode() == ISD::UNDEF) + if (NewBldVec[i].isUndef()) continue; for (unsigned j = 0; j < i; j++) { if (NewBldVec[i] == NewBldVec[j]) { @@ -1758,8 +1867,8 @@ static SDValue CompactSwizzlableVector( } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, @@ -1796,14 +1905,13 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry, } } - return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry), - VectorEntry.getValueType(), NewBldVec); + return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry), + NewBldVec); } - -SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, - SDValue Swz[4], SelectionDAG &DAG, - SDLoc DL) const { +SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4], + SelectionDAG &DAG, + const SDLoc &DL) const { assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR); // Old -> New swizzle values DenseMap<unsigned, unsigned> SwizzleRemap; @@ -1886,7 +1994,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); // If the inserted element is an UNDEF, just use the input vector. - if (InVal.getOpcode() == ISD::UNDEF) + if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); @@ -1907,7 +2015,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, if (InVec.getOpcode() == ISD::BUILD_VECTOR) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); - } else if (InVec.getOpcode() == ISD::UNDEF) { + } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { @@ -1927,7 +2035,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, } // Return the new vector - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } // Extract_vec (Build_vector) generated by custom lowering @@ -1953,8 +2061,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT_CC: { // Try common optimizations - SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI); - if (Ret.getNode()) + if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI)) return Ret; // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> @@ -2053,13 +2160,14 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } -static bool -FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, - SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); +bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, + SDValue &Src, SDValue &Neg, SDValue &Abs, + SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const { + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Src.isMachineOpcode()) return false; + switch (Src.getMachineOpcode()) { case AMDGPU::FNEG_R600: if (!Neg.getNode()) @@ -2127,6 +2235,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); return true; } + case AMDGPU::MOV_IMM_GLOBAL_ADDR: + // Check if the Imm slot is used. Taken from below. 
+ if (cast<ConstantSDNode>(Imm)->getZExtValue()) + return false; + Imm = Src.getOperand(0); + Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + return true; case AMDGPU::MOV_IMM_I32: case AMDGPU::MOV_IMM_F32: { unsigned ImmReg = AMDGPU::ALU_LITERAL_X; @@ -2177,14 +2292,13 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg, } } - /// \brief Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo()); + const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; + unsigned Opcode = Node->getMachineOpcode(); SDValue FakeOp; diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 4dbac97af2a1..2fb6ee25caa9 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -12,55 +12,69 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600ISELLOWERING_H -#define LLVM_LIB_TARGET_R600_R600ISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_R600ISELLOWERING_H #include "AMDGPUISelLowering.h" namespace llvm { class R600InstrInfo; +class R600Subtarget; -class R600TargetLowering : public AMDGPUTargetLowering { +class R600TargetLowering final : public AMDGPUTargetLowering { public: - R600TargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI); - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock * BB) const override; + R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); + + const R600Subtarget *getSubtarget() const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; - SDValue LowerFormalArguments( - SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, + unsigned Align, + bool *IsFast) const override; + private: unsigned Gen; /// Each OpenCL kernel has nine implicit parameters that are stored in the /// first nine dwords of a Vertex Buffer. These implicit parameters are /// lowered to load instructions which retrieve the values from the Vertex /// Buffer. 
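The comment just above on the nine implicit kernel parameters, read together with the "36 + VA.getLocMemOffset()" computation in the LowerFormalArguments hunk earlier, implies that those implicit parameters fill the first nine dwords (36 bytes) of the parameter buffer, so explicit kernel arguments begin at byte 36. A minimal worked illustration, with invented names and not taken from the patch:

#include <cstdio>

// Nine implicit dwords occupy bytes 0-35 of the vertex/parameter buffer; the
// 36-byte bias added to VA.getLocMemOffset() in LowerFormalArguments skips them.
// The CONSTANT-space loads in the same hunk use the preferred alignment of 4.
constexpr unsigned NumImplicitDwords = 9;
constexpr unsigned ImplicitParamBytes = NumImplicitDwords * 4; // 36

static unsigned explicitArgByteOffset(unsigned LocMemOffset) {
  return ImplicitParamBytes + LocMemOffset; // mirrors "36 + VA.getLocMemOffset()"
}

int main() {
  std::printf("first explicit argument starts at byte %u\n",
              explicitArgByteOffset(0)); // 36
  return 0;
}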
- SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, - SDLoc DL, unsigned DwordOffset) const; + SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, const SDLoc &DL, + unsigned DwordOffset) const; void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG, - SDLoc DL) const; + const SDLoc &DL) const; SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + SDValue lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, + SelectionDAG &DAG) const override; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerPrivateTruncStore(StoreSDNode *Store, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue lowerPrivateExtLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; @@ -74,6 +88,13 @@ private: void getStackAddress(unsigned StackWidth, unsigned ElemIdx, unsigned &Channel, unsigned &PtrIncr) const; bool isZero(SDValue Op) const; + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + + bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, + SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, + SelectionDAG &DAG) const; + SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; }; diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp index 8b6eea17130b..1c5f7ec1b6ef 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -28,26 +28,17 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenDFAPacketizer.inc" -R600InstrInfo::R600InstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} - -const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { - return RI; -} - -bool R600InstrInfo::isTrig(const MachineInstr &MI) const { - return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; -} +R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) + : AMDGPUInstrInfo(ST), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; } -void -R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && @@ -91,10 +82,9 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB, } bool R600InstrInfo::isMov(unsigned Opcode) const { - - switch(Opcode) { - default: return false; + default: + return false; case AMDGPU::MOV: case AMDGPU::MOV_IMM_F32: case AMDGPU::MOV_IMM_I32: @@ -102,17 +92,6 @@ bool R600InstrInfo::isMov(unsigned Opcode) const { } } -// Some instructions act as place holders to emulate 
operations that the GPU -// hardware does automatically. This function can be used to check if -// an opcode falls into this category. -bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { - switch (Opcode) { - default: return false; - case AMDGPU::RETURN: - return true; - } -} - bool R600InstrInfo::isReductionOp(unsigned Opcode) const { return false; } @@ -150,20 +129,16 @@ bool R600InstrInfo::isLDSInstr(unsigned Opcode) const { (TargetFlags & R600_InstFlag::LDS_1A2D)); } -bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1; -} - bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; } -bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { - if (isALUInstr(MI->getOpcode())) +bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { + if (isALUInstr(MI.getOpcode())) return true; - if (isVector(*MI) || isCubeOp(MI->getOpcode())) + if (isVector(MI) || isCubeOp(MI.getOpcode())) return true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::PRED_X: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: @@ -182,16 +157,16 @@ bool R600InstrInfo::isTransOnly(unsigned Opcode) const { return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); } -bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const { - return isTransOnly(MI->getOpcode()); +bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { + return isTransOnly(MI.getOpcode()); } bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); } -bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const { - return isVectorOnly(MI->getOpcode()); +bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const { + return isVectorOnly(MI.getOpcode()); } bool R600InstrInfo::isExport(unsigned Opcode) const { @@ -202,23 +177,21 @@ bool R600InstrInfo::usesVertexCache(unsigned Opcode) const { return ST.hasVertexCache() && IS_VTX(get(Opcode)); } -bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); - return MFI->getShaderType() != ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode()); +bool R600InstrInfo::usesVertexCache(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getParent()->getParent(); + return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + usesVertexCache(MI.getOpcode()); } bool R600InstrInfo::usesTextureCache(unsigned Opcode) const { return (!ST.hasVertexCache() && IS_VTX(get(Opcode))) || IS_TEX(get(Opcode)); } -bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); - return (MFI->getShaderType() == ShaderType::COMPUTE && - usesVertexCache(MI->getOpcode())) || - usesTextureCache(MI->getOpcode()); +bool R600InstrInfo::usesTextureCache(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getParent()->getParent(); + return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && + usesVertexCache(MI.getOpcode())) || + usesTextureCache(MI.getOpcode()); } bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { @@ -231,20 +204,21 @@ bool R600InstrInfo::mustBeLastInClause(unsigned 
Opcode) const { } } -bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; +bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { + return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; } -bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const { - return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; +bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { + return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; } -bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { - if (!isALUInstr(MI->getOpcode())) { +bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { + if (!isALUInstr(MI.getOpcode())) { return false; } - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { + for (MachineInstr::const_mop_iterator I = MI.operands_begin(), + E = MI.operands_end(); + I != E; ++I) { if (!I->isReg() || !I->isUse() || TargetRegisterInfo::isVirtualRegister(I->getReg())) continue; @@ -255,17 +229,6 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const { return false; } -int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const { - static const unsigned OpTable[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 - }; - - assert (SrcNum < 3); - return getOperandIdx(Opcode, OpTable[SrcNum]); -} - int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { static const unsigned SrcSelTable[][2] = { {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, @@ -290,10 +253,10 @@ int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { } SmallVector<std::pair<MachineOperand *, int64_t>, 3> -R600InstrInfo::getSrcs(MachineInstr *MI) const { +R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; - if (MI->getOpcode() == AMDGPU::DOT_4) { + if (MI.getOpcode() == AMDGPU::DOT_4) { static const unsigned OpTable[8][2] = { {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, @@ -306,13 +269,13 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { }; for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); + MachineOperand &MO = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); unsigned Reg = MO.getReg(); if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][1])).getImm(); - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + MachineOperand &Sel = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -327,30 +290,33 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { }; for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); if (SrcIdx < 0) break; - MachineOperand &MO = MI->getOperand(SrcIdx); - unsigned Reg = MI->getOperand(SrcIdx).getReg(); + MachineOperand &MO = MI.getOperand(SrcIdx); + unsigned Reg = MO.getReg(); if (Reg == AMDGPU::ALU_CONST) { - unsigned Sel = MI->getOperand( - getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm(); - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel)); + MachineOperand &Sel = + MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + Result.push_back(std::make_pair(&MO, 
Sel.getImm())); continue; } if (Reg == AMDGPU::ALU_LITERAL_X) { - unsigned Imm = MI->getOperand( - getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm(); - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm)); - continue; + MachineOperand &Operand = + MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + if (Operand.isImm()) { + Result.push_back(std::make_pair(&MO, Operand.getImm())); + continue; + } + assert(Operand.isGlobal()); } - Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0)); + Result.push_back(std::make_pair(&MO, 0)); } return Result; } -std::vector<std::pair<int, unsigned> > -R600InstrInfo::ExtractSrcs(MachineInstr *MI, +std::vector<std::pair<int, unsigned>> +R600InstrInfo::ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const { ConstCount = 0; @@ -360,13 +326,13 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI, unsigned i = 0; for (unsigned n = Srcs.size(); i < n; ++i) { unsigned Reg = Srcs[i].first->getReg(); - unsigned Index = RI.getEncodingValue(Reg) & 0xff; + int Index = RI.getEncodingValue(Reg) & 0xff; if (Reg == AMDGPU::OQAP) { - Result.push_back(std::pair<int, unsigned>(Index, 0)); + Result.push_back(std::make_pair(Index, 0U)); } if (PV.find(Reg) != PV.end()) { // 255 is used to tells its a PS/PV reg - Result.push_back(std::pair<int, unsigned>(255, 0)); + Result.push_back(std::make_pair(255, 0U)); continue; } if (Index > 127) { @@ -375,7 +341,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr *MI, continue; } unsigned Chan = RI.getHWRegChan(Reg); - Result.push_back(std::pair<int, unsigned>(Index, Chan)); + Result.push_back(std::make_pair(Index, Chan)); } for (; i < 3; ++i) Result.push_back(DummyPair); @@ -411,8 +377,7 @@ Swizzle(std::vector<std::pair<int, unsigned> > Src, return Src; } -static unsigned -getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { +static unsigned getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { switch (Swz) { case R600InstrInfo::ALU_VEC_012_SCL_210: { unsigned Cycles[3] = { 2, 1, 0}; @@ -432,7 +397,6 @@ getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) { } default: llvm_unreachable("Wrong Swizzle for Trans Slot"); - return 0; } } @@ -557,7 +521,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount)); + IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); unsigned Op = getOperandIdx(IG[i]->getOpcode(), AMDGPU::OpName::bank_swizzle); ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) @@ -624,14 +588,13 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) std::vector<unsigned> Consts; SmallSet<int64_t, 4> Literals; for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr *MI = MIs[i]; - if (!isALUInstr(MI->getOpcode())) + MachineInstr &MI = *MIs[i]; + if (!isALUInstr(MI.getOpcode())) continue; ArrayRef<std::pair<MachineOperand *, int64_t>> Srcs = getSrcs(MI); - for (unsigned j = 0, e = Srcs.size(); j < e; j++) { - std::pair<MachineOperand *, unsigned> Src = Srcs[j]; + for (const auto &Src:Srcs) { if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -652,7 +615,7 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) DFAPacketizer * R600InstrInfo::CreateTargetScheduleState(const TargetSubtargetInfo &STI) const 
{ const InstrItineraryData *II = STI.getInstrItineraryData(); - return static_cast<const AMDGPUSubtarget &>(STI).createDFAPacketizer(II); + return static_cast<const R600Subtarget &>(STI).createDFAPacketizer(II); } static bool @@ -670,9 +633,9 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { while (I != MBB.begin()) { --I; - MachineInstr *MI = I; - if (isPredicateSetter(MI->getOpcode())) - return MI; + MachineInstr &MI = *I; + if (isPredicateSetter(MI.getOpcode())) + return &MI; } return nullptr; @@ -688,12 +651,11 @@ static bool isBranch(unsigned Opcode) { Opcode == AMDGPU::BRANCH_COND_f32; } -bool -R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // Most of the following comes from the ARM implementation of AnalyzeBranch // If the block has no terminators, it just falls into the block after it. @@ -716,21 +678,21 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, I->removeFromParent(); I = PriorI; } - MachineInstr *LastInst = I; + MachineInstr &LastInst = *I; // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); + unsigned LastOpc = LastInst.getOpcode(); if (I == MBB.begin() || !isJump(static_cast<MachineInstr *>(--I)->getOpcode())) { if (LastOpc == AMDGPU::JUMP) { - TBB = LastInst->getOperand(0).getMBB(); + TBB = LastInst.getOperand(0).getMBB(); return false; } else if (LastOpc == AMDGPU::JUMP_COND) { - MachineInstr *predSet = I; + auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; } - TBB = LastInst->getOperand(0).getMBB(); + TBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); @@ -740,17 +702,17 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); + MachineInstr &SecondLastInst = *I; + unsigned SecondLastOpc = SecondLastInst.getOpcode(); // If the block ends with a B and a Bcc, handle it. 
if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { - MachineInstr *predSet = --I; + auto predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; } - TBB = SecondLastInst->getOperand(0).getMBB(); - FBB = LastInst->getOperand(0).getMBB(); + TBB = SecondLastInst.getOperand(0).getMBB(); + FBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); @@ -772,12 +734,11 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { return MBB.end(); } -unsigned -R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, - MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, - DebugLoc DL) const { +unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { assert(TBB && "InsertBranch must not be told to insert a fallthrough"); if (!FBB) { @@ -787,7 +748,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); + addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) @@ -803,7 +764,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); assert(PredSet && "No previous predicate !"); - addFlag(PredSet, 0, MO_FLAG_PUSH); + addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) .addMBB(TBB) @@ -835,7 +796,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 0; case AMDGPU::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); + clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) @@ -860,7 +821,7 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 1; case AMDGPU::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); - clearFlag(predSet, 0, MO_FLAG_PUSH); + clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) @@ -876,13 +837,12 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return 2; } -bool -R600InstrInfo::isPredicated(const MachineInstr *MI) const { - int idx = MI->findFirstPredOperandIdx(); +bool R600InstrInfo::isPredicated(const MachineInstr &MI) const { + int idx = MI.findFirstPredOperandIdx(); if (idx < 0) return false; - unsigned Reg = MI->getOperand(idx).getReg(); + unsigned Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; case AMDGPU::PRED_SEL_ONE: @@ -892,25 +852,22 @@ R600InstrInfo::isPredicated(const MachineInstr *MI) const { } } -bool -R600InstrInfo::isPredicable(MachineInstr *MI) const { +bool R600InstrInfo::isPredicable(MachineInstr &MI) const { // XXX: KILL* instructions can be predicated, but they must be the last // instruction in a clause, so this means any instructions after them cannot // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. 
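These hunks rename AnalyzeBranch to analyzeBranch and move the branch and predication hooks to MachineInstr references; the caller-side contract stays the usual TargetInstrInfo one. A hedged usage sketch (not code from the patch): a false return means the terminators were understood, and a non-empty Cond holds the three operands pushed above, with the PRED_SEL register in Cond[2] later reused by PredicateInstruction.

#include "R600InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
using namespace llvm;

// Illustrative helper only: true when MBB ends in a conditional branch that
// analyzeBranch() understands; TBB/FBB are discarded here.
static bool endsInConditionalBranch(const R600InstrInfo &TII,
                                    MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr;
  MachineBasicBlock *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (TII.analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false))
    return false; // could not analyze the terminators
  return !Cond.empty(); // unconditional branches leave Cond empty
}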
- if (MI->getOpcode() == AMDGPU::KILLGT) { + if (MI.getOpcode() == AMDGPU::KILLGT) { return false; - } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + } else if (MI.getOpcode() == AMDGPU::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. - if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI)) return false; // TODO: We don't support KC merging atm - if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) - return false; - return true; - } else if (isVector(*MI)) { + return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0; + } else if (isVector(MI)) { return false; } else { return AMDGPUInstrInfo::isPredicable(MI); @@ -986,48 +943,39 @@ R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) con return false; } -bool -R600InstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - return isPredicateSetter(MI->getOpcode()); +bool R600InstrInfo::DefinesPredicate(MachineInstr &MI, + std::vector<MachineOperand> &Pred) const { + return isPredicateSetter(MI.getOpcode()); } -bool -R600InstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const { - return false; -} - - -bool -R600InstrInfo::PredicateInstruction(MachineInstr *MI, - ArrayRef<MachineOperand> Pred) const { - int PIdx = MI->findFirstPredOperandIdx(); +bool R600InstrInfo::PredicateInstruction(MachineInstr &MI, + ArrayRef<MachineOperand> Pred) const { + int PIdx = MI.findFirstPredOperandIdx(); - if (MI->getOpcode() == AMDGPU::CF_ALU) { - MI->getOperand(8).setImm(0); + if (MI.getOpcode() == AMDGPU::CF_ALU) { + MI.getOperand(8).setImm(0); return true; } - if (MI->getOpcode() == AMDGPU::DOT_4) { - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) + if (MI.getOpcode() == AMDGPU::DOT_4) { + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z)) .setReg(Pred[2].getReg()); - MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W)) + MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); return true; } if (PIdx != -1) { - MachineOperand &PMO = MI->getOperand(PIdx); + MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); return true; } @@ -1035,45 +983,94 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI, return false; } -unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const { +unsigned int R600InstrInfo::getPredicationCost(const MachineInstr &) const { return 2; } unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, + const MachineInstr &, unsigned *PredCost) const { if (PredCost) *PredCost = 2; return 2; } -bool 
R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { +unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, + unsigned Channel) const { + assert(Channel == 0); + return RegIndex; +} - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); +bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: { + MachineBasicBlock *MBB = MI.getParent(); + int OffsetOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr); + // addr is a custom operand with multiple MI operands, and only the + // first MI operand is given a name. + int RegOpIdx = OffsetOpIdx + 1; + int ChanOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan); + if (isRegisterLoad(MI)) { + int DstOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); + unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), + getIndirectAddrRegClass()->getRegister(Address)); + } else { + buildIndirectRead(MBB, MI, MI.getOperand(DstOpIdx).getReg(), Address, + OffsetReg); + } + } else if (isRegisterStore(MI)) { + int ValOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val); + unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); + unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); + unsigned Address = calculateIndirectAddress(RegIndex, Channel); + unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); + if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), + MI.getOperand(ValOpIdx).getReg()); + } else { + buildIndirectWrite(MBB, MI, MI.getOperand(ValOpIdx).getReg(), + calculateIndirectAddress(RegIndex, Channel), + OffsetReg); + } + } else { + return false; + } + + MBB->erase(MI); + return true; + } case AMDGPU::R600_EXTRACT_ELT_V2: case AMDGPU::R600_EXTRACT_ELT_V4: - buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(2).getReg(), - RI.getHWRegChan(MI->getOperand(1).getReg())); + buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(), + RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address + MI.getOperand(2).getReg(), + RI.getHWRegChan(MI.getOperand(1).getReg())); break; case AMDGPU::R600_INSERT_ELT_V2: case AMDGPU::R600_INSERT_ELT_V4: - buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value - RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address - MI->getOperand(3).getReg(), // Offset - RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address + MI.getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI.getOperand(1).getReg())); // Channel break; } - MI->eraseFromParent(); + MI.eraseFromParent(); return true; } void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { - const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>( - MF.getSubtarget().getFrameLowering()); + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + const R600FrameLowering *TFL = ST.getFrameLowering(); unsigned 
StackWidth = TFL->getStackWidth(MF); int End = getIndirectIndexEnd(MF); @@ -1091,13 +1088,6 @@ void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, } } -unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - // XXX: Remove when we support a stack width > 2 - assert(Channel == 0); - return RegIndex; -} - const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::R600_TReg32_XRegClass; } @@ -1124,13 +1114,13 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); + setImmOperand(*MOVA, AMDGPU::OpName::write, 0); MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, AddrReg, ValueReg) .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1); + setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1); return Mov; } @@ -1157,17 +1147,74 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); - setImmOperand(MOVA, AMDGPU::OpName::write, 0); + setImmOperand(*MOVA, AMDGPU::OpName::write, 0); MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, ValueReg, AddrReg) .addReg(AMDGPU::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1); + setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1); return Mov; } +int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + int Offset = -1; + + if (MFI->getNumObjects() == 0) { + return -1; + } + + if (MRI.livein_empty()) { + return 0; + } + + const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass(); + for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(), + LE = MRI.livein_end(); + LI != LE; ++LI) { + unsigned Reg = LI->first; + if (TargetRegisterInfo::isVirtualRegister(Reg) || + !IndirectRC->contains(Reg)) + continue; + + unsigned RegIndex; + unsigned RegEnd; + for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd; + ++RegIndex) { + if (IndirectRC->getRegister(RegIndex) == Reg) + break; + } + Offset = std::max(Offset, (int)RegIndex); + } + + return Offset + 1; +} + +int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { + int Offset = 0; + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Variable sized objects are not supported + if (MFI->hasVarSizedObjects()) { + return -1; + } + + if (MFI->getNumObjects() == 0) { + return -1; + } + + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + const R600FrameLowering *TFL = ST.getFrameLowering(); + + unsigned IgnoredFrameReg; + Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg); + + return getIndirectIndexBegin(MF) + Offset; +} + unsigned R600InstrInfo::getMaxAlusPerClause() const { return 115; } @@ -1256,7 +1303,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( const { assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); unsigned Opcode; - if (ST.getGeneration() <= AMDGPUSubtarget::R700) + if (ST.getGeneration() <= R600Subtarget::R700) Opcode = AMDGPU::DOT4_r600; else Opcode = AMDGPU::DOT4_eg; @@ -1293,7 +1340,7 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineOperand 
&MO = MI->getOperand( getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); assert (MO.isImm()); - setImmOperand(MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operands[i], MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; @@ -1305,7 +1352,7 @@ MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, uint64_t Imm) const { MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, AMDGPU::ALU_LITERAL_X); - setImmOperand(MovImm, AMDGPU::OpName::literal, Imm); + setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm); return MovImm; } @@ -1323,25 +1370,21 @@ int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { return AMDGPU::getNamedOperandIdx(Opcode, Op); } -void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op, +void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const { - int Idx = getOperandIdx(*MI, Op); + int Idx = getOperandIdx(MI, Op); assert(Idx != -1 && "Operand not supported for this instruction."); - assert(MI->getOperand(Idx).isImm()); - MI->getOperand(Idx).setImm(Imm); + assert(MI.getOperand(Idx).isImm()); + MI.getOperand(Idx).setImm(Imm); } //===----------------------------------------------------------------------===// // Instruction flag getters/setters //===----------------------------------------------------------------------===// -bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const { - return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0; -} - -MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, +MachineOperand &R600InstrInfo::getFlagOp(MachineInstr &MI, unsigned SrcIdx, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; int FlagIndex = 0; if (Flag != 0) { // If we pass something other than the default value of Flag to this @@ -1351,20 +1394,26 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; switch (Flag) { case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp); break; case MO_FLAG_MASK: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write); break; case MO_FLAG_NOT_LAST: case MO_FLAG_LAST: - FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last); + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last); break; case MO_FLAG_NEG: switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break; - case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break; + case 0: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg); + break; + case 1: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg); + break; + case 2: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg); + break; } break; @@ -1373,8 +1422,12 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, "instructions."); (void)IsOP3; switch (SrcIdx) { - case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break; - case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break; + case 0: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs); + break; + case 1: + FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs); + break; } break; @@ -1389,14 +1442,14 @@ MachineOperand 
&R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx, "Instruction flags not supported for this instruction"); } - MachineOperand &FlagOp = MI->getOperand(FlagIndex); + MachineOperand &FlagOp = MI.getOperand(FlagIndex); assert(FlagOp.isImm()); return FlagOp; } -void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, +void R600InstrInfo::addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (Flag == 0) { return; } @@ -1415,9 +1468,9 @@ void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, } } -void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, +void R600InstrInfo::clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const { - unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + unsigned TargetFlags = get(MI.getOpcode()).TSFlags; if (HAS_NATIVE_OPERANDS(TargetFlags)) { MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); FlagOp.setImm(0); @@ -1428,3 +1481,11 @@ void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, FlagOp.setImm(InstFlags); } } + +bool R600InstrInfo::isRegisterStore(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_STORE; +} + +bool R600InstrInfo::isRegisterLoad(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD; +} diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h index e7251c31107b..feaca98def44 100644 --- a/lib/Target/AMDGPU/R600InstrInfo.h +++ b/lib/Target/AMDGPU/R600InstrInfo.h @@ -12,30 +12,28 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600INSTRINFO_H -#define LLVM_LIB_TARGET_R600_R600INSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H #include "AMDGPUInstrInfo.h" -#include "R600Defines.h" #include "R600RegisterInfo.h" -#include <map> namespace llvm { - - class AMDGPUTargetMachine; - class DFAPacketizer; - class ScheduleDAG; - class MachineFunction; - class MachineInstr; - class MachineInstrBuilder; - - class R600InstrInfo : public AMDGPUInstrInfo { - private: +class AMDGPUTargetMachine; +class DFAPacketizer; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; +class R600Subtarget; + +class R600InstrInfo final : public AMDGPUInstrInfo { +private: const R600RegisterInfo RI; + const R600Subtarget &ST; - std::vector<std::pair<int, unsigned> > - ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; - + std::vector<std::pair<int, unsigned>> + ExtractSrcs(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV, + unsigned &ConstCount) const; MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, @@ -44,11 +42,11 @@ namespace llvm { unsigned AddrChan) const; MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg, - unsigned AddrChan) const; - public: + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; +public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, ALU_VEC_021_SCL_122, @@ -58,18 +56,18 @@ namespace llvm { ALU_VEC_210 }; - explicit R600InstrInfo(const AMDGPUSubtarget &st); + explicit R600InstrInfo(const R600Subtarget &); - const R600RegisterInfo &getRegisterInfo() const 
override; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, + const R600RegisterInfo &getRegisterInfo() const { + return RI; + } + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; bool isLegalToSplitMBBAt(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const override; - bool isTrig(const MachineInstr &MI) const; - bool isPlaceHolderOpcode(unsigned opcode) const; bool isReductionOp(unsigned opcode) const; bool isCubeOp(unsigned opcode) const; @@ -77,32 +75,28 @@ namespace llvm { bool isALUInstr(unsigned Opcode) const; bool hasInstrModifiers(unsigned Opcode) const; bool isLDSInstr(unsigned Opcode) const; - bool isLDSNoRetInstr(unsigned Opcode) const; bool isLDSRetInstr(unsigned Opcode) const; /// \returns true if this \p Opcode represents an ALU instruction or an /// instruction that will be lowered in ExpandSpecialInstrs Pass. - bool canBeConsideredALU(const MachineInstr *MI) const; + bool canBeConsideredALU(const MachineInstr &MI) const; bool isTransOnly(unsigned Opcode) const; - bool isTransOnly(const MachineInstr *MI) const; + bool isTransOnly(const MachineInstr &MI) const; bool isVectorOnly(unsigned Opcode) const; - bool isVectorOnly(const MachineInstr *MI) const; + bool isVectorOnly(const MachineInstr &MI) const; bool isExport(unsigned Opcode) const; bool usesVertexCache(unsigned Opcode) const; - bool usesVertexCache(const MachineInstr *MI) const; + bool usesVertexCache(const MachineInstr &MI) const; bool usesTextureCache(unsigned Opcode) const; - bool usesTextureCache(const MachineInstr *MI) const; + bool usesTextureCache(const MachineInstr &MI) const; bool mustBeLastInClause(unsigned Opcode) const; - bool usesAddressRegister(MachineInstr *MI) const; - bool definesAddressRegister(MachineInstr *MI) const; - bool readsLDSSrcReg(const MachineInstr *MI) const; + bool usesAddressRegister(MachineInstr &MI) const; + bool definesAddressRegister(MachineInstr &MI) const; + bool readsLDSSrcReg(const MachineInstr &MI) const; - /// \returns The operand index for the given source number. Legal values - /// for SrcNum are 0, 1, and 2. - int getSrcIdx(unsigned Opcode, unsigned SrcNum) const; /// \returns The operand Index for the Sel operand given an index to one /// of the instruction's src operands. int getSelIdx(unsigned Opcode, unsigned SrcIdx) const; @@ -113,7 +107,7 @@ namespace llvm { /// If register is ALU_LITERAL, second member is IMM. /// Otherwise, second member value is undefined. SmallVector<std::pair<MachineOperand *, int64_t>, 3> - getSrcs(MachineInstr *MI) const; + getSrcs(MachineInstr &MI) const; unsigned isLegalUpTo( const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs, @@ -152,89 +146,107 @@ namespace llvm { /// instruction slots within an instruction group. 
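Given the getSrcs() documentation in this header hunk and the literal-counting loop in fitsConstReadLimitations earlier, a short hedged sketch of the reference-based API may help; the helper name is invented and the code is not part of the patch.

#include "R600InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Collect the inline literal values an ALU instruction reads. getSrcs() pairs
// each source MachineOperand with an int64_t payload: the constant-buffer sel
// for ALU_CONST, the literal value for ALU_LITERAL_X, and an unspecified value
// otherwise (per the doc comment above).
static void collectAluLiterals(const R600InstrInfo &TII, MachineInstr &MI,
                               SmallVectorImpl<int64_t> &Literals) {
  if (!TII.isALUInstr(MI.getOpcode()))
    return;
  for (const auto &Src : TII.getSrcs(MI))
    if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
      Literals.push_back(Src.second);
}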
bool isVector(const MachineInstr &MI) const; - bool isMov(unsigned Opcode) const override; + bool isMov(unsigned Opcode) const; DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &) const override; - bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + bool ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const override; - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - DebugLoc DL) const override; + const DebugLoc &DL) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - bool isPredicated(const MachineInstr *MI) const override; + bool isPredicated(const MachineInstr &MI) const override; - bool isPredicable(MachineInstr *MI) const override; + bool isPredicable(MachineInstr &MI) const override; - bool - isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - BranchProbability Probability) const override; + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, BranchProbability Probability) const override ; - bool - isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - BranchProbability Probability) const override; - - bool DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const override; + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; - bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const override; + bool DefinesPredicate(MachineInstr &MI, + std::vector<MachineOperand> &Pred) const override; bool isProfitableToUnpredicate(MachineBasicBlock &TMBB, - MachineBasicBlock &FMBB) const override; + MachineBasicBlock &FMBB) const override; - bool PredicateInstruction(MachineInstr *MI, + bool PredicateInstruction(MachineInstr &MI, ArrayRef<MachineOperand> Pred) const override; - unsigned int getPredicationCost(const MachineInstr *) const override; + unsigned int getPredicationCost(const MachineInstr &) const override; unsigned int getInstrLatency(const InstrItineraryData *ItinData, - const MachineInstr *MI, + const MachineInstr &MI, unsigned *PredCost = nullptr) const override; - int getInstrLatency(const InstrItineraryData *ItinData, - SDNode *Node) const override { return 1;} - - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; /// \brief Reserve the registers that may be accesed using indirect addressing. 
void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; - unsigned calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const override; + /// Calculate the "Indirect Address" for the given \p RegIndex and + /// \p Channel + /// + /// We model indirect addressing using a virtual address space that can be + /// accesed with loads and stores. The "Indirect Address" is the memory + /// address in this virtual address space that maps to the given \p RegIndex + /// and \p Channel. + unsigned calculateIndirectAddress(unsigned RegIndex, unsigned Channel) const; + + + /// \returns The register class to be used for loading and storing values + /// from an "Indirect Address" . + const TargetRegisterClass *getIndirectAddrRegClass() const; + + /// \returns the smallest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexBegin(const MachineFunction &MF) const; - const TargetRegisterClass *getIndirectAddrRegClass() const override; + /// \returns the largest register index that will be accessed by an indirect + /// read or write or -1 if indirect addressing is not used by this program. + int getIndirectIndexEnd(const MachineFunction &MF) const; + /// \brief Build instruction(s) for an indirect register write. + /// + /// \returns The instruction that performs the indirect register write MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg) const; + /// \brief Build instruction(s) for an indirect register read. + /// + /// \returns The instruction that performs the indirect register read MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, - unsigned OffsetReg) const override; + unsigned OffsetReg) const; unsigned getMaxAlusPerClause() const; - ///buildDefaultInstruction - This function returns a MachineInstr with - /// all the instruction modifiers initialized to their default values. - /// You can use this function to avoid manually specifying each instruction - /// modifier operand when building a new instruction. + /// buildDefaultInstruction - This function returns a MachineInstr with all + /// the instruction modifiers initialized to their default values. You can + /// use this function to avoid manually specifying each instruction modifier + /// operand when building a new instruction. /// /// \returns a MachineInstr with all the instruction modifiers initialized /// to their default values. @@ -251,13 +263,13 @@ namespace llvm { unsigned DstReg) const; MachineInstr *buildMovImm(MachineBasicBlock &BB, - MachineBasicBlock::iterator I, - unsigned DstReg, - uint64_t Imm) const; + MachineBasicBlock::iterator I, + unsigned DstReg, + uint64_t Imm) const; MachineInstr *buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; + unsigned DstReg, unsigned SrcReg) const; /// \brief Get the index of Op in the MachineInstr. /// @@ -270,13 +282,10 @@ namespace llvm { int getOperandIdx(unsigned Opcode, unsigned Op) const; /// \brief Helper function for setting instruction flag values. 
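The new doc comments above describe indirect addressing as loads and stores against a small virtual address space. With calculateIndirectAddress() as defined in this change the mapping is still trivial, which the standalone sketch below spells out (illustrative name, not patch code):

#include <cassert>

// Model of calculateIndirectAddress(RegIndex, Channel): only channel 0 is
// accepted, and the "indirect address" is simply the register index, which
// expandPostRAPseudo then uses to pick a register from the indirect address
// register class (R600_TReg32_X).
static unsigned indirectAddressModel(unsigned RegIndex, unsigned Channel) {
  assert(Channel == 0 && "only channel 0 is supported");
  return RegIndex;
}

int main() {
  assert(indirectAddressModel(3, 0) == 3);
  return 0;
}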
- void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const; - - /// \returns true if this instruction has an operand for storing target flags. - bool hasFlagOperand(const MachineInstr &MI) const; + void setImmOperand(MachineInstr &MI, unsigned Op, int64_t Imm) const; ///\brief Add one of the MO_FLAG* flags to the specified \p Operand. - void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + void addFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; ///\brief Determine if the specified \p Flag is set on this \p Operand. bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const; @@ -285,11 +294,15 @@ namespace llvm { /// \param Flag The flag being set. /// /// \returns the operand containing the flags for this instruction. - MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0, + MachineOperand &getFlagOp(MachineInstr &MI, unsigned SrcIdx = 0, unsigned Flag = 0) const; /// \brief Clear the specified flag on the instruction. - void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const; + void clearFlag(MachineInstr &MI, unsigned Operand, unsigned Flag) const; + + // Helper functions that check the opcode for status information + bool isRegisterStore(const MachineInstr &MI) const; + bool isRegisterLoad(const MachineInstr &MI) const; }; namespace AMDGPU { diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 33ef6a4e19ea..b6b576d95278 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -15,7 +15,7 @@ include "R600Intrinsics.td" include "R600InstrFormats.td" -class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern = []> : InstR600 <outs, ins, asm, pattern, NullALU> { let Namespace = "AMDGPU"; @@ -160,7 +160,8 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern, let Inst{63-32} = Word1; } -class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node, +class R600_2OP_Helper <bits<11> inst, string opName, + SDPatternOperator node = null_frag, InstrItinClass itin = AnyALU> : R600_2OP <inst, opName, [(set R600_Reg32:$dst, (node R600_Reg32:$src0, @@ -283,7 +284,7 @@ class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, } class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern> - : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>, + : InstR600ISA <outs, (ins MEMxi:$src_gpr), !strconcat(" ", name), pattern>, VTX_WORD1_GPR { // Static fields @@ -328,18 +329,44 @@ class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern> class LoadParamFrag <PatFrag load_type> : PatFrag < (ops node:$ptr), (load_type node:$ptr), - [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }] + [{ return isConstantLoad(cast<LoadSDNode>(N), 0) || + (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }] >; def load_param : LoadParamFrag<load>; def load_param_exti8 : LoadParamFrag<az_extloadi8>; def load_param_exti16 : LoadParamFrag<az_extloadi16>; -def isR600 : Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::R700">; +class LoadVtxId1 <PatFrag load> : PatFrag < + (ops node:$ptr), (load node:$ptr), [{ + const MemSDNode *LD = cast<MemSDNode>(N); + return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + !isa<GlobalValue>(GetUnderlyingObject( + LD->getMemOperand()->getValue(), 
CurDAG->getDataLayout()))); +}]>; + +def vtx_id1_az_extloadi8 : LoadVtxId1 <az_extloadi8>; +def vtx_id1_az_extloadi16 : LoadVtxId1 <az_extloadi16>; +def vtx_id1_load : LoadVtxId1 <load>; + +class LoadVtxId2 <PatFrag load> : PatFrag < + (ops node:$ptr), (load node:$ptr), [{ + const MemSDNode *LD = cast<MemSDNode>(N); + return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + isa<GlobalValue>(GetUnderlyingObject( + LD->getMemOperand()->getValue(), CurDAG->getDataLayout())); +}]>; + +def vtx_id2_az_extloadi8 : LoadVtxId2 <az_extloadi8>; +def vtx_id2_az_extloadi16 : LoadVtxId2 <az_extloadi16>; +def vtx_id2_load : LoadVtxId2 <load>; + +def isR600 : Predicate<"Subtarget->getGeneration() <= R600Subtarget::R700">; def isR600toCayman : Predicate< - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">; + "Subtarget->getGeneration() <= R600Subtarget::NORTHERN_ISLANDS">; //===----------------------------------------------------------------------===// // R600 SDNodes @@ -407,8 +434,7 @@ def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR, def INTERP_VEC_LOAD : AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins i32imm:$src0), - "INTERP_LOAD $src0 : $dst", - [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>; + "INTERP_LOAD $src0 : $dst">; def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; @@ -474,28 +500,6 @@ class ExportBufWord1 { } multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { - def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 0, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg), - (ExportInst - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $reg, sub0), - 0, 61, 7, 0, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy (i32 imm:$type)), - (ExportInst - (v4f32 (IMPLICIT_DEF)), imm:$type, 0, 7, 7, 7, 7, cf_inst, 0) - >; - - def : Pat<(int_R600_store_dummy 1), - (ExportInst - (v4f32 (IMPLICIT_DEF)), 1, 60, 7, 7, 7, 7, cf_inst, 0) - >; - def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 imm:$base), (i32 imm:$type), (i32 imm:$swz_x), (i32 imm:$swz_y), (i32 imm:$swz_z), (i32 imm:$swz_w)), (ExportInst R600_Reg128:$src, imm:$type, imm:$base, @@ -507,22 +511,22 @@ multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> { multiclass SteamOutputExportPattern<Instruction ExportInst, bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> { // Stream0 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 0), (i32 imm:$mask)), (ExportInst R600_Reg128:$src, 0, imm:$arraybase, 4095, imm:$mask, buf0inst, 0)>; // Stream1 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 1), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf1inst, 0)>; // Stream2 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 2), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf2inst, 0)>; // Stream3 - def : Pat<(int_R600_store_stream_output (v4f32 R600_Reg128:$src), + def : Pat<(int_r600_store_stream_output (v4f32 R600_Reg128:$src), (i32 imm:$arraybase), (i32 3), (i32 imm:$mask)), (ExportInst $src, 0, imm:$arraybase, 4095, imm:$mask, buf3inst, 0)>; @@ 
-678,7 +682,7 @@ let Predicates = [isR600toCayman] in { def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; // Non-IEEE MUL: 0 * anything = 0 -def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE">; def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; // TODO: Do these actually match the regular fmin/fmax behavior? def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; @@ -733,6 +737,7 @@ def SETNE_DX10 : R600_2OP < [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE_NE))] >; +// FIXME: Need combine for AMDGPUfract def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>; def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>; def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>; @@ -758,6 +763,13 @@ def : Pat < (MOV_IMM_I32 imm:$val) >; +def MOV_IMM_GLOBAL_ADDR : MOV_IMM<iPTR, i32imm>; +def : Pat < + (AMDGPUconstdata_ptr tglobaladdr:$addr), + (MOV_IMM_GLOBAL_ADDR tglobaladdr:$addr) +>; + + def MOV_IMM_F32 : MOV_IMM<f32, f32imm>; def : Pat < (fpimm:$val), @@ -851,7 +863,7 @@ class R600_TEX <bits<11> inst, string opName> : i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z, CT:$COORD_TYPE_W), - !strconcat(opName, + !strconcat(" ", opName, " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, " "$SRC_GPR.$srcx$srcy$srcz$srcw " "RID:$RESOURCE_ID SID:$SAMPLER_ID " @@ -1099,14 +1111,13 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < // Clamped to maximum. class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamp > { let Itinerary = TransALU; } class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy -> { + inst, "RECIPSQRT_IEEE", AMDGPUrsq> { let Itinerary = TransALU; } @@ -1135,11 +1146,6 @@ def FNEG_R600 : FNEG<R600_Reg32>; // FIXME: Should be predicated on unsafe fp math. 
multiclass DIV_Common <InstR600 recip_ieee> { def : Pat< - (int_AMDGPU_div f32:$src0, f32:$src1), - (MUL_IEEE $src0, (recip_ieee $src1)) ->; - -def : Pat< (fdiv f32:$src0, f32:$src1), (MUL_IEEE $src0, (recip_ieee $src1)) >; @@ -1147,12 +1153,6 @@ def : Pat< def : RcpPat<recip_ieee, f32>; } -class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> - : Pat < - (int_TGSI_lit_z f32:$src_x, f32:$src_y, f32:$src_w), - (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) ->; - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1191,7 +1191,6 @@ let Predicates = [isR600] in { defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>; def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>; - def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>; def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat<RECIPSQRT_IEEE_r600, f32>; @@ -1332,9 +1331,7 @@ def TXD: InstR600 < (outs R600_Reg128:$dst), (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), - "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, imm:$textureTarget))], + "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", [], NullALU > { let TEXInst = 1; } @@ -1344,10 +1341,7 @@ def TXD_SHADOW: InstR600 < (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget), "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget", - [(set v4f32:$dst, (int_AMDGPU_txd v4f32:$src0, v4f32:$src1, v4f32:$src2, - imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))], - NullALU -> { + [], NullALU> { let TEXInst = 1; } } // End isPseudo = 1 @@ -1426,8 +1420,7 @@ def TEX_VTX_CONSTBUF : } def TEX_VTX_TEXBUF: - InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr", - [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>, + InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr">, VTX_WORD1_GPR, VTX_WORD0_eg { let VC_INST = 0; @@ -1542,8 +1535,9 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { //===---------------------------------------------------------------------===// let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(IL_retflag)]>; + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(AMDGPUendpgm)] + >; } //===----------------------------------------------------------------------===// @@ -1729,12 +1723,6 @@ def : DwordAddrPat <i32, R600_Reg32>; } // End isR600toCayman Predicate -let Predicates = [isR600] in { -// Intrinsic patterns -defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>; -defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>; -} // End isR600 - def getLDSNoRetOp : InstrMapping { let FilterClass = "R600_LDS_1A1D"; let RowFields = ["BaseOp"]; diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td index 9681747006d9..a5310e9fd6d0 100644 --- a/lib/Target/AMDGPU/R600Intrinsics.td +++ 
b/lib/Target/AMDGPU/R600Intrinsics.td @@ -11,65 +11,57 @@ // //===----------------------------------------------------------------------===// -let TargetPrefix = "R600", isTarget = 1 in { - class TextureIntrinsicFloatInput : - Intrinsic<[llvm_v4f32_ty], [ - llvm_v4f32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; - class TextureIntrinsicInt32Input : - Intrinsic<[llvm_v4i32_ty], [ - llvm_v4i32_ty, // Coord - llvm_i32_ty, // offset_x - llvm_i32_ty, // offset_y, - llvm_i32_ty, // offset_z, - llvm_i32_ty, // resource_id - llvm_i32_ty, // samplerid - llvm_i32_ty, // coord_type_x - llvm_i32_ty, // coord_type_y - llvm_i32_ty, // coord_type_z - llvm_i32_ty // coord_type_w - ], [IntrNoMem]>; +class TextureIntrinsicFloatInput : Intrinsic<[llvm_v4f32_ty], [ + llvm_v4f32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty], // coord_type_w + [IntrNoMem] +>; - def int_R600_load_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_input : - Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_interp_const : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_R600_interp_xy : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; -def int_R600_interp_zw : - Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_R600_load_texbuf : - Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_R600_tex : TextureIntrinsicFloatInput; - def int_R600_texc : TextureIntrinsicFloatInput; - def int_R600_txl : TextureIntrinsicFloatInput; - def int_R600_txlc : TextureIntrinsicFloatInput; - def int_R600_txb : TextureIntrinsicFloatInput; - def int_R600_txbc : TextureIntrinsicFloatInput; - def int_R600_txf : TextureIntrinsicInt32Input; - def int_R600_ldptr : TextureIntrinsicInt32Input; - def int_R600_txq : TextureIntrinsicInt32Input; - def int_R600_ddx : TextureIntrinsicFloatInput; - def int_R600_ddy : TextureIntrinsicFloatInput; - def int_R600_store_swizzle : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_stream_output : - Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; - def int_R600_store_pixel_depth : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_pixel_stencil : - Intrinsic<[], [llvm_float_ty], []>; - def int_R600_store_dummy : - Intrinsic<[], [llvm_i32_ty], []>; -} +class TextureIntrinsicInt32Input : Intrinsic<[llvm_v4i32_ty], [ + llvm_v4i32_ty, // Coord + llvm_i32_ty, // offset_x + llvm_i32_ty, // offset_y, + llvm_i32_ty, // offset_z, + llvm_i32_ty, // resource_id + llvm_i32_ty, // samplerid + llvm_i32_ty, // coord_type_x + llvm_i32_ty, // coord_type_y + llvm_i32_ty, // coord_type_z + llvm_i32_ty], // coord_type_w + [IntrNoMem] +>; + +let TargetPrefix = "r600", isTarget = 1 in { + +def int_r600_store_swizzle : + Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [] +>; + +def int_r600_store_stream_output : Intrinsic< + [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [] +>; + +def int_r600_tex : 
TextureIntrinsicFloatInput; +def int_r600_texc : TextureIntrinsicFloatInput; +def int_r600_txl : TextureIntrinsicFloatInput; +def int_r600_txlc : TextureIntrinsicFloatInput; +def int_r600_txb : TextureIntrinsicFloatInput; +def int_r600_txbc : TextureIntrinsicFloatInput; +def int_r600_txf : TextureIntrinsicInt32Input; +def int_r600_txq : TextureIntrinsicInt32Input; +def int_r600_ddx : TextureIntrinsicFloatInput; +def int_r600_ddy : TextureIntrinsicFloatInput; + +def int_r600_dot4 : Intrinsic<[llvm_float_ty], + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] +>; + +} // End TargetPrefix = "r600", isTarget = 1 diff --git a/lib/Target/AMDGPU/R600MachineFunctionInfo.h b/lib/Target/AMDGPU/R600MachineFunctionInfo.h index 263561edd30d..04a4436ebe03 100644 --- a/lib/Target/AMDGPU/R600MachineFunctionInfo.h +++ b/lib/Target/AMDGPU/R600MachineFunctionInfo.h @@ -10,17 +10,16 @@ /// \file //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_R600MACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600MACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" -#include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/SelectionDAG.h" #include <vector> namespace llvm { -class R600MachineFunctionInfo : public AMDGPUMachineFunction { +class R600MachineFunctionInfo final : public AMDGPUMachineFunction { void anchor() override; public: R600MachineFunctionInfo(const MachineFunction &MF); diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index bcde5fb50dac..db18e5bd1afa 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "R600MachineScheduler.h" +#include "R600InstrInfo.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Pass.h" @@ -26,7 +27,7 @@ using namespace llvm; void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); DAG = static_cast<ScheduleDAGMILive*>(dag); - const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>(); + const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>(); TII = static_cast<const R600InstrInfo*>(DAG->TII); TRI = static_cast<const R600RegisterInfo*>(DAG->TRI); VLIW5 = !ST.hasCaymanISA(); @@ -48,8 +49,7 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, QSrc.clear(); } -static -unsigned getWFCountLimitedByGPR(unsigned GPRCount) { +static unsigned getWFCountLimitedByGPR(unsigned GPRCount) { assert (GPRCount && "GPRCount cannot be 0"); return 248 / GPRCount; } @@ -222,75 +222,74 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg, R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { MachineInstr *MI = SU->getInstr(); - if (TII->isTransOnly(MI)) + if (TII->isTransOnly(*MI)) return AluTrans; - switch (MI->getOpcode()) { - case AMDGPU::PRED_X: - return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: - return AluT_XYZW; - case AMDGPU::COPY: - if (MI->getOperand(1).isUndef()) { - // MI will become a KILL, don't considers it in scheduling - return AluDiscarded; - } - default: - break; - } - - // Does the instruction take a whole IG ? 
- // XXX: Is it possible to add a helper function in R600InstrInfo that can - // be used here and in R600PacketizerList::isSoloInstruction() ? - if(TII->isVector(*MI) || - TII->isCubeOp(MI->getOpcode()) || - TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { - return AluT_XYZW; + switch (MI->getOpcode()) { + case AMDGPU::PRED_X: + return AluPredX; + case AMDGPU::INTERP_PAIR_XY: + case AMDGPU::INTERP_PAIR_ZW: + case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: + return AluT_XYZW; + case AMDGPU::COPY: + if (MI->getOperand(1).isUndef()) { + // MI will become a KILL, don't considers it in scheduling + return AluDiscarded; } + default: + break; + } - if (TII->isLDSInstr(MI->getOpcode())) { - return AluT_X; - } + // Does the instruction take a whole IG ? + // XXX: Is it possible to add a helper function in R600InstrInfo that can + // be used here and in R600PacketizerList::isSoloInstruction() ? + if(TII->isVector(*MI) || + TII->isCubeOp(MI->getOpcode()) || + TII->isReductionOp(MI->getOpcode()) || + MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + return AluT_XYZW; + } - // Is the result already assigned to a channel ? - unsigned DestSubReg = MI->getOperand(0).getSubReg(); - switch (DestSubReg) { - case AMDGPU::sub0: - return AluT_X; - case AMDGPU::sub1: - return AluT_Y; - case AMDGPU::sub2: - return AluT_Z; - case AMDGPU::sub3: - return AluT_W; - default: - break; - } + if (TII->isLDSInstr(MI->getOpcode())) { + return AluT_X; + } - // Is the result already member of a X/Y/Z/W class ? - unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) - return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) - return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) - return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) - return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) - return AluT_XYZW; - - // LDS src registers cannot be used in the Trans slot. - if (TII->readsLDSSrcReg(MI)) - return AluT_XYZW; - - return AluAny; + // Is the result already assigned to a channel ? + unsigned DestSubReg = MI->getOperand(0).getSubReg(); + switch (DestSubReg) { + case AMDGPU::sub0: + return AluT_X; + case AMDGPU::sub1: + return AluT_Y; + case AMDGPU::sub2: + return AluT_Z; + case AMDGPU::sub3: + return AluT_W; + default: + break; + } + // Is the result already member of a X/Y/Z/W class ? + unsigned DestReg = MI->getOperand(0).getReg(); + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + return AluT_X; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + return AluT_Y; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + return AluT_Z; + if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + return AluT_W; + if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + return AluT_XYZW; + + // LDS src registers cannot be used in the Trans slot. 
+ if (TII->readsLDSSrcReg(*MI)) + return AluT_XYZW; + + return AluAny; } int R600SchedStrategy::getInstKind(SUnit* SU) { @@ -324,9 +323,8 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { It != E; ++It) { SUnit *SU = *It; InstructionsGroupCandidate.push_back(SU->getInstr()); - if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) - && (!AnyALU || !TII->isVectorOnly(SU->getInstr())) - ) { + if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) && + (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) { InstructionsGroupCandidate.pop_back(); Q.erase((It + 1).base()); return SU; @@ -350,7 +348,7 @@ void R600SchedStrategy::PrepareNextSlot() { DEBUG(dbgs() << "New Slot\n"); assert (OccupedSlotsMask && "Slot wasn't filled"); OccupedSlotsMask = 0; -// if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS) +// if (HwGen == R600Subtarget::NORTHERN_ISLANDS) // OccupedSlotsMask |= 16; InstructionsGroupCandidate.clear(); LoadAlu(); diff --git a/lib/Target/AMDGPU/R600MachineScheduler.h b/lib/Target/AMDGPU/R600MachineScheduler.h index fc5b95c28e71..16d5d939708c 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.h +++ b/lib/Target/AMDGPU/R600MachineScheduler.h @@ -12,20 +12,19 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H -#define LLVM_LIB_TARGET_R600_R600MACHINESCHEDULER_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H +#define LLVM_LIB_TARGET_AMDGPU_R600MACHINESCHEDULER_H -#include "R600InstrInfo.h" -#include "llvm/ADT/PriorityQueue.h" #include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/Support/Debug.h" using namespace llvm; namespace llvm { -class R600SchedStrategy : public MachineSchedStrategy { +class R600InstrInfo; +struct R600RegisterInfo; +class R600SchedStrategy final : public MachineSchedStrategy { const ScheduleDAGMILive *DAG; const R600InstrInfo *TII; const R600RegisterInfo *TRI; diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 5efb3b9fc20e..ecae27d2233d 100644 --- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -29,6 +29,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "R600Defines.h" #include "R600InstrInfo.h" #include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineDominators.h" @@ -210,9 +211,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector( (void)Tmp; SrcVec = DstReg; } - Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg) - .addReg(SrcVec); - DEBUG(dbgs() << " ->"; Pos->dump();); + MachineInstr *NewMI = + BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec); + DEBUG(dbgs() << " ->"; NewMI->dump();); DEBUG(dbgs() << " Updating Swizzle:\n"); for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg), @@ -224,11 +225,11 @@ MachineInstr *R600VectorRegMerger::RebuildVector( RSI->Instr->eraseFromParent(); // Update RSI - RSI->Instr = Pos; + RSI->Instr = NewMI; RSI->RegToChan = UpdatedRegToChan; RSI->UndefReg = UpdatedUndef; - return Pos; + return NewMI; } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { @@ -314,8 +315,13 @@ void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) { } bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const R600InstrInfo *>(Fn.getSubtarget().getInstrInfo()); - MRI = &(Fn.getRegInfo()); + if (skipFunction(*Fn.getFunction())) + return false; + + const R600Subtarget &ST = 
Fn.getSubtarget<R600Subtarget>(); + TII = ST.getInstrInfo(); + MRI = &Fn.getRegInfo(); + for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { MachineBasicBlock *MB = &*MBB; @@ -325,10 +331,10 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); MII != MIIE; ++MII) { - MachineInstr *MI = MII; - if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) { - if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { - unsigned Reg = MI->getOperand(1).getReg(); + MachineInstr &MI = *MII; + if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) { + if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { + unsigned Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end(); It != E; ++It) { @@ -338,17 +344,17 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { continue; } - - RegSeqInfo RSI(*MRI, MI); + RegSeqInfo RSI(*MRI, &MI); // All uses of MI are swizzeable ? - unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg = MI.getOperand(0).getReg(); if (!areAllUsesSwizzeable(Reg)) continue; - DEBUG (dbgs() << "Trying to optimize "; - MI->dump(); - ); + DEBUG({ + dbgs() << "Trying to optimize "; + MI.dump(); + }); RegSeqInfo CandidateRSI; std::vector<std::pair<unsigned, unsigned> > RemapChan; diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp index 21269613a305..c84866469ae8 100644 --- a/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/lib/Target/AMDGPU/R600Packetizer.cpp @@ -56,15 +56,14 @@ public: char R600Packetizer::ID = 0; class R600PacketizerList : public VLIWPacketizerList { - private: const R600InstrInfo *TII; const R600RegisterInfo &TRI; bool VLIW5; bool ConsideredInstUsesAlreadyWrittenVectorElement; - unsigned getSlot(const MachineInstr *MI) const { - return TRI.getHWRegChan(MI->getOperand(0).getReg()); + unsigned getSlot(const MachineInstr &MI) const { + return TRI.getHWRegChan(MI.getOperand(0).getReg()); } /// \returns register to PV chan mapping for bundle/single instructions that @@ -81,11 +80,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(&*BI); + int BISlot = getSlot(*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(&*BI)) + if (TII->isPredicated(*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +94,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(&*BI)) { + if (isTrans || TII->isTransOnly(*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -129,7 +128,7 @@ private: return Result; } - void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs) + void substitutePV(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PVs) const { unsigned Ops[] = { AMDGPU::OpName::src0, @@ -137,23 +136,23 @@ private: AMDGPU::OpName::src2 }; for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]); + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); if (OperandIdx < 0) continue; - unsigned Src = MI->getOperand(OperandIdx).getReg(); + unsigned Src = MI.getOperand(OperandIdx).getReg(); const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src); if (It != PVs.end()) - MI->getOperand(OperandIdx).setReg(It->second); + 
MI.getOperand(OperandIdx).setReg(It->second); } } public: // Ctor. - R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) + R600PacketizerList(MachineFunction &MF, const R600Subtarget &ST, + MachineLoopInfo &MLI) : VLIWPacketizerList(MF, MLI, nullptr), - TII(static_cast<const R600InstrInfo *>( - MF.getSubtarget().getInstrInfo())), + TII(ST.getInstrInfo()), TRI(TII->getRegisterInfo()) { - VLIW5 = !MF.getSubtarget<AMDGPUSubtarget>().hasCaymanISA(); + VLIW5 = !ST.hasCaymanISA(); } // initPacketizerState - initialize some internal flags. @@ -162,32 +161,30 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(const MachineInstr *MI, + bool ignorePseudoInstruction(const MachineInstr &MI, const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(const MachineInstr *MI) override { - if (TII->isVector(*MI)) + bool isSoloInstruction(const MachineInstr &MI) override { + if (TII->isVector(MI)) return true; - if (!TII->isALUInstr(MI->getOpcode())) + if (!TII->isALUInstr(MI.getOpcode())) return true; - if (MI->getOpcode() == AMDGPU::GROUP_BARRIER) + if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) return true; // XXX: This can be removed once the packetizer properly handles all the // LDS instruction group restrictions. - if (TII->isLDSInstr(MI->getOpcode())) - return true; - return false; + return TII->isLDSInstr(MI.getOpcode()); } // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ // together. bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override { MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr(); - if (getSlot(MII) == getSlot(MIJ)) + if (getSlot(*MII) == getSlot(*MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; // Does MII and MIJ share the same pred_sel ? int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), @@ -210,14 +207,12 @@ public: } } - bool ARDef = TII->definesAddressRegister(MII) || - TII->definesAddressRegister(MIJ); - bool ARUse = TII->usesAddressRegister(MII) || - TII->usesAddressRegister(MIJ); - if (ARDef && ARUse) - return false; + bool ARDef = + TII->definesAddressRegister(*MII) || TII->definesAddressRegister(*MIJ); + bool ARUse = + TII->usesAddressRegister(*MII) || TII->usesAddressRegister(*MIJ); - return true; + return !ARDef || !ARUse; } // isLegalToPruneDependencies - Is it legal to prune dependece between SUI @@ -231,7 +226,7 @@ public: MI->getOperand(LastOp).setImm(Bit); } - bool isBundlableWithCurrentPMI(MachineInstr *MI, + bool isBundlableWithCurrentPMI(MachineInstr &MI, const DenseMap<unsigned, unsigned> &PV, std::vector<R600InstrInfo::BankSwizzle> &BS, bool &isTransSlot) { @@ -240,11 +235,14 @@ public: // Is the dst reg sequence legal ? if (!isTransSlot && !CurrentPacketMIs.empty()) { - if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) { - if (ConsideredInstUsesAlreadyWrittenVectorElement && + if (getSlot(MI) <= getSlot(*CurrentPacketMIs.back())) { + if (ConsideredInstUsesAlreadyWrittenVectorElement && !TII->isVectorOnly(MI) && VLIW5) { isTransSlot = true; - DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump();); + DEBUG({ + dbgs() << "Considering as Trans Inst :"; + MI.dump(); + }); } else return false; @@ -252,18 +250,18 @@ public: } // Are the Constants limitations met ? 
- CurrentPacketMIs.push_back(MI); + CurrentPacketMIs.push_back(&MI); if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) { - DEBUG( + DEBUG({ dbgs() << "Couldn't pack :\n"; - MI->dump(); + MI.dump(); dbgs() << "with the following packets :\n"; for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { CurrentPacketMIs[i]->dump(); dbgs() << "\n"; } dbgs() << "because of Consts read limitations\n"; - ); + }); CurrentPacketMIs.pop_back(); return false; } @@ -271,16 +269,16 @@ public: // Is there a BankSwizzle set that meet Read Port limitations ? if (!TII->fitsReadPortLimitations(CurrentPacketMIs, PV, BS, isTransSlot)) { - DEBUG( + DEBUG({ dbgs() << "Couldn't pack :\n"; - MI->dump(); + MI.dump(); dbgs() << "with the following packets :\n"; for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) { CurrentPacketMIs[i]->dump(); dbgs() << "\n"; } dbgs() << "because of Read port limitations\n"; - ); + }); CurrentPacketMIs.pop_back(); return false; } @@ -293,9 +291,9 @@ public: return true; } - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override { + MachineBasicBlock::iterator addToPacket(MachineInstr &MI) override { MachineBasicBlock::iterator FirstInBundle = - CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front(); + CurrentPacketMIs.empty() ? &MI : CurrentPacketMIs.front(); const DenseMap<unsigned, unsigned> &PV = getPreviousVector(FirstInBundle); std::vector<R600InstrInfo::BankSwizzle> BS; @@ -308,9 +306,9 @@ public: AMDGPU::OpName::bank_swizzle); MI->getOperand(Op).setImm(BS[i]); } - unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); - MI->getOperand(Op).setImm(BS.back()); + unsigned Op = + TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); + MI.getOperand(Op).setImm(BS.back()); if (!CurrentPacketMIs.empty()) setIsLastBit(CurrentPacketMIs.back(), 0); substitutePV(MI, PV); @@ -320,7 +318,7 @@ public: } return It; } - endPacket(MI->getParent(), MI); + endPacket(MI.getParent(), MI); if (TII->isTransOnly(MI)) return MI; return VLIWPacketizerList::addToPacket(MI); @@ -328,15 +326,20 @@ public: }; bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); + const R600Subtarget &ST = Fn.getSubtarget<R600Subtarget>(); + const R600InstrInfo *TII = ST.getInstrInfo(); + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); // Instantiate the packetizer. - R600PacketizerList Packetizer(Fn, MLI); + R600PacketizerList Packetizer(Fn, ST, MLI); // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) + return false; + // // Loop over all basic blocks and remove KILL pseudo-instructions // These instructions confuse the dependence analysis. Consider: @@ -375,7 +378,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) + if (TII->isSchedulingBoundary(*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp index fb0359cfc651..dfdc602b80cd 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -28,8 +28,8 @@ R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); - const R600InstrInfo *TII = - static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); + const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>(); + const R600InstrInfo *TII = ST.getInstrInfo(); Reserved.set(AMDGPU::ZERO); Reserved.set(AMDGPU::HALF); @@ -89,3 +89,10 @@ bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const { return true; } } + +void R600RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS) const { + llvm_unreachable("Subroutines not supported yet"); +} diff --git a/lib/Target/AMDGPU/R600RegisterInfo.h b/lib/Target/AMDGPU/R600RegisterInfo.h index 4f8a129ce4a6..9dfb3106c6cc 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/lib/Target/AMDGPU/R600RegisterInfo.h @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_R600REGISTERINFO_H -#define LLVM_LIB_TARGET_R600_R600REGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H #include "AMDGPURegisterInfo.h" @@ -21,7 +21,7 @@ namespace llvm { class AMDGPUSubtarget; -struct R600RegisterInfo : public AMDGPURegisterInfo { +struct R600RegisterInfo final : public AMDGPURegisterInfo { RegClassWeight RCW; R600RegisterInfo(); @@ -31,7 +31,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the HW encoding for a register's channel. unsigned getHWRegChan(unsigned reg) const; - unsigned getHWRegIndex(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const; /// \brief get the register class of the specified type to use in the /// CFGStructurizer @@ -40,8 +40,13 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; - // \returns true if \p Reg can be defined in one ALU caluse and used in another. + // \returns true if \p Reg can be defined in one ALU clause and used in + // another. bool isPhysRegLiveAcrossClauses(unsigned Reg) const; + + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/R600Schedule.td b/lib/Target/AMDGPU/R600Schedule.td index df62bf85c0ad..70fb46c1a7d6 100644 --- a/lib/Target/AMDGPU/R600Schedule.td +++ b/lib/Target/AMDGPU/R600Schedule.td @@ -9,7 +9,7 @@ // // R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction // slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS -// slot has been removed. +// slot has been removed. 
// //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp b/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp deleted file mode 100644 index 2fc7b02f673f..000000000000 --- a/lib/Target/AMDGPU/R600TextureIntrinsicsReplacer.cpp +++ /dev/null @@ -1,303 +0,0 @@ -//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass translates tgsi-like texture intrinsics into R600 texture -/// closer to hardware intrinsics. -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Passes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstVisitor.h" - -using namespace llvm; - -namespace { -class R600TextureIntrinsicsReplacer : - public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> { - static char ID; - - Module *Mod; - Type *FloatType; - Type *Int32Type; - Type *V4f32Type; - Type *V4i32Type; - FunctionType *TexSign; - FunctionType *TexQSign; - - void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD, - unsigned SrcSelect[4], unsigned CT[4], - bool &useShadowVariant) { - enum TextureTypes { - TEXTURE_1D = 1, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_CUBE, - TEXTURE_RECT, - TEXTURE_SHADOW1D, - TEXTURE_SHADOW2D, - TEXTURE_SHADOWRECT, - TEXTURE_1D_ARRAY, - TEXTURE_2D_ARRAY, - TEXTURE_SHADOW1D_ARRAY, - TEXTURE_SHADOW2D_ARRAY, - TEXTURE_SHADOWCUBE, - TEXTURE_2D_MSAA, - TEXTURE_2D_ARRAY_MSAA, - TEXTURE_CUBE_ARRAY, - TEXTURE_SHADOWCUBE_ARRAY - }; - - switch (TextureType) { - case 0: - useShadowVariant = false; - return; - case TEXTURE_RECT: - case TEXTURE_1D: - case TEXTURE_2D: - case TEXTURE_3D: - case TEXTURE_CUBE: - case TEXTURE_1D_ARRAY: - case TEXTURE_2D_ARRAY: - case TEXTURE_CUBE_ARRAY: - case TEXTURE_2D_MSAA: - case TEXTURE_2D_ARRAY_MSAA: - useShadowVariant = false; - break; - case TEXTURE_SHADOW1D: - case TEXTURE_SHADOW2D: - case TEXTURE_SHADOWRECT: - case TEXTURE_SHADOW1D_ARRAY: - case TEXTURE_SHADOW2D_ARRAY: - case TEXTURE_SHADOWCUBE: - case TEXTURE_SHADOWCUBE_ARRAY: - useShadowVariant = true; - break; - default: - llvm_unreachable("Unknow Texture Type"); - } - - if (TextureType == TEXTURE_RECT || - TextureType == TEXTURE_SHADOWRECT) { - CT[0] = 0; - CT[1] = 0; - } - - if (TextureType == TEXTURE_CUBE_ARRAY || - TextureType == TEXTURE_SHADOWCUBE_ARRAY) - CT[2] = 0; - - if (TextureType == TEXTURE_1D_ARRAY || - TextureType == TEXTURE_SHADOW1D_ARRAY) { - if (hasLOD && useShadowVariant) { - CT[1] = 0; - } else { - CT[2] = 0; - SrcSelect[2] = 1; - } - } else if (TextureType == TEXTURE_2D_ARRAY || - TextureType == TEXTURE_SHADOW2D_ARRAY) { - CT[2] = 0; - } - - if ((TextureType == TEXTURE_SHADOW1D || - TextureType == TEXTURE_SHADOW2D || - TextureType == TEXTURE_SHADOWRECT || - TextureType == TEXTURE_SHADOW1D_ARRAY) && - !(hasLOD && useShadowVariant)) - SrcSelect[3] = 2; - } - - void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name, - unsigned SrcSelect[4], Value *Offset[3], Value *Resource, - Value *Sampler, unsigned CT[4], Value *Coord) { - IRBuilder<> Builder(&I); - Constant *Mask[] = { - 
ConstantInt::get(Int32Type, SrcSelect[0]), - ConstantInt::get(Int32Type, SrcSelect[1]), - ConstantInt::get(Int32Type, SrcSelect[2]), - ConstantInt::get(Int32Type, SrcSelect[3]) - }; - Value *SwizzleMask = ConstantVector::get(Mask); - Value *SwizzledCoord = - Builder.CreateShuffleVector(Coord, Coord, SwizzleMask); - - Value *Args[] = { - SwizzledCoord, - Offset[0], - Offset[1], - Offset[2], - Resource, - Sampler, - ConstantInt::get(Int32Type, CT[0]), - ConstantInt::get(Int32Type, CT[1]), - ConstantInt::get(Int32Type, CT[2]), - ConstantInt::get(Int32Type, CT[3]) - }; - - Function *F = Mod->getFunction(Name); - if (!F) { - F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod); - F->addFnAttr(Attribute::ReadNone); - } - I.replaceAllUsesWith(Builder.CreateCall(F, Args)); - I.eraseFromParent(); - } - - void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT, - const char *VanillaInt, - const char *ShadowInt) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(1); - Value *SamplerId = I.getArgOperand(2); - - unsigned TextureType = - cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0), - ConstantInt::get(Int32Type, 0) - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - - void ReplaceTXF(CallInst &I) { - Value *Coord = I.getArgOperand(0); - Value *ResourceId = I.getArgOperand(4); - Value *SamplerId = I.getArgOperand(5); - - unsigned TextureType = - cast<ConstantInt>(I.getArgOperand(6))->getZExtValue(); - - unsigned SrcSelect[4] = { 0, 1, 2, 3 }; - unsigned CT[4] = {1, 1, 1, 1}; - Value *Offset[3] = { - I.getArgOperand(1), - I.getArgOperand(2), - I.getArgOperand(3), - }; - bool useShadowVariant; - - getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT, - useShadowVariant); - - ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect, - Offset, ResourceId, SamplerId, CT, Coord); - } - -public: - R600TextureIntrinsicsReplacer(): - FunctionPass(ID) { - } - - bool doInitialization(Module &M) override { - LLVMContext &Ctx = M.getContext(); - Mod = &M; - FloatType = Type::getFloatTy(Ctx); - Int32Type = Type::getInt32Ty(Ctx); - V4f32Type = VectorType::get(FloatType, 4); - V4i32Type = VectorType::get(Int32Type, 4); - Type *ArgsType[] = { - V4f32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false); - Type *ArgsQType[] = { - V4i32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - Int32Type, - }; - TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false); - return false; - } - - bool runOnFunction(Function &F) override { - visit(F); - return false; - } - - const char *getPassName() const override { - return "R600 Texture Intrinsics Replacer"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - } - - void visitCallInst(CallInst &I) { - if (!I.getCalledFunction()) - return; - - StringRef Name = I.getCalledFunction()->getName(); - if (Name == "llvm.AMDGPU.tex") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc"); - return; - } - if 
(Name == "llvm.AMDGPU.txl") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc"); - return; - } - if (Name == "llvm.AMDGPU.txb") { - ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc"); - return; - } - if (Name == "llvm.AMDGPU.txf") { - ReplaceTXF(I); - return; - } - if (Name == "llvm.AMDGPU.txq") { - ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq"); - return; - } - if (Name == "llvm.AMDGPU.ddx") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx"); - return; - } - if (Name == "llvm.AMDGPU.ddy") { - ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy"); - return; - } - } - -}; - -char R600TextureIntrinsicsReplacer::ID = 0; - -} - -FunctionPass *llvm::createR600TextureIntrinsicsReplacer() { - return new R600TextureIntrinsicsReplacer(); -} diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index fa4d24a2f25a..5f182c5304c6 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -14,6 +14,7 @@ #include "AMDGPU.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Dominators.h" @@ -34,17 +35,16 @@ typedef std::pair<BasicBlock *, Value *> StackEntry; typedef SmallVector<StackEntry, 16> StackVector; // Intrinsic names the control flow is annotated with -static const char *const IfIntrinsic = "llvm.SI.if"; -static const char *const ElseIntrinsic = "llvm.SI.else"; -static const char *const BreakIntrinsic = "llvm.SI.break"; -static const char *const IfBreakIntrinsic = "llvm.SI.if.break"; -static const char *const ElseBreakIntrinsic = "llvm.SI.else.break"; -static const char *const LoopIntrinsic = "llvm.SI.loop"; -static const char *const EndCfIntrinsic = "llvm.SI.end.cf"; +static const char *const IfIntrinsic = "llvm.amdgcn.if"; +static const char *const ElseIntrinsic = "llvm.amdgcn.else"; +static const char *const BreakIntrinsic = "llvm.amdgcn.break"; +static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break"; +static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break"; +static const char *const LoopIntrinsic = "llvm.amdgcn.loop"; +static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf"; class SIAnnotateControlFlow : public FunctionPass { - - static char ID; + DivergenceAnalysis *DA; Type *Boolean; Type *Void; @@ -69,6 +69,8 @@ class SIAnnotateControlFlow : public FunctionPass { LoopInfo *LI; + bool isUniform(BranchInst *T); + bool isTopOfStack(BasicBlock *BB); Value *popSaved(); @@ -83,13 +85,16 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - Value *handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L); + Value *handleLoopCondition(Value *Cond, PHINode *Broken, + llvm::Loop *L, BranchInst *Term); void handleLoop(BranchInst *Term); void closeControlFlow(BasicBlock *BB); public: + static char ID; + SIAnnotateControlFlow(): FunctionPass(ID) { } @@ -104,6 +109,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<DivergenceAnalysis>(); AU.addPreserved<DominatorTreeWrapperPass>(); FunctionPass::getAnalysisUsage(AU); } @@ -112,6 +118,12 @@ public: } // end anonymous namespace +INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE, + "Annotate SI Control Flow", false, 
false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE, + "Annotate SI Control Flow", false, false) + char SIAnnotateControlFlow::ID = 0; /// \brief Initialize all the types and constants used in the pass @@ -152,6 +164,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { return false; } +/// \brief Is the branch condition uniform or did the StructurizeCFG pass +/// consider it as such? +bool SIAnnotateControlFlow::isUniform(BranchInst *T) { + return DA->isUniform(T->getCondition()) || + T->getMetadata("structurizecfg.uniform") != nullptr; +} + /// \brief Is BB the last block saved on the stack ? bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { return !Stack.empty() && Stack.back().first == BB; @@ -194,6 +213,9 @@ void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { /// \brief Open a new "If" block void SIAnnotateControlFlow::openIf(BranchInst *Term) { + if (isUniform(Term)) { + return; + } Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -201,6 +223,9 @@ void SIAnnotateControlFlow::openIf(BranchInst *Term) { /// \brief Close the last "If" block and open a new "Else" block void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + if (isUniform(Term)) { + return; + } Value *Ret = CallInst::Create(Else, popSaved(), "", Term); Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); @@ -208,7 +233,7 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { /// \brief Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L) { + llvm::Loop *L, BranchInst *Term) { // Only search through PHI nodes which are inside the loop. If we try this // with PHI nodes that are outside of the loop, we end up inserting new PHI @@ -232,7 +257,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, } Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L); + Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term); NewPhi->addIncoming(PhiArg, From); } @@ -246,7 +271,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, BasicBlock *From = Phi->getIncomingBlock(i); if (From == IDom) { + // We're in the following situation: + // IDom/From + // | \ + // | If-block + // | / + // Parent + // where we want to break out of the loop if the If-block is not taken. + // Due to the depth-first traversal, there should be an end.cf + // intrinsic in Parent, and we insert an else.break before it. + // + // Note that the end.cf need not be the first non-phi instruction + // of parent, particularly when we're dealing with a multi-level + // break, but it should occur within a group of intrinsic calls + // at the beginning of the block. 
CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); + while (OldEnd && OldEnd->getCalledFunction() != EndCf) + OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode()); if (OldEnd && OldEnd->getCalledFunction() == EndCf) { Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); @@ -271,14 +312,23 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken, Value *Args[] = { Cond, Broken }; return CallInst::Create(IfBreak, Args, "", Insert); + // Insert IfBreak before TERM for constant COND. + } else if (isa<ConstantInt>(Cond)) { + Value *Args[] = { Cond, Broken }; + return CallInst::Create(IfBreak, Args, "", Term); + } else { llvm_unreachable("Unhandled loop condition!"); } - return 0; + return nullptr; } /// \brief Handle a back edge (loop) void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + if (isUniform(Term)) { + return; + } + BasicBlock *BB = Term->getParent(); llvm::Loop *L = LI->getLoopFor(BB); BasicBlock *Target = Term->getSuccessor(1); @@ -286,7 +336,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L); + Value *Arg = handleLoopCondition(Cond, Broken, L, Term); for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); PI != PE; ++PI) { @@ -300,6 +350,8 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { llvm::Loop *L = LI->getLoopFor(BB); + assert(Stack.back().first == BB); + if (L && L->getHeader() == BB) { // We can't insert an EndCF call into a loop header, because it will // get executed on every iteration of the loop, when it should be @@ -315,14 +367,18 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); + Value *Exec = popSaved(); + if (!isa<UndefValue>(Exec)) + CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can /// recognize if/then/else and loops. bool SIAnnotateControlFlow::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + DA = &getAnalysis<DivergenceAnalysis>(); for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), E = df_end(&F.getEntryBlock()); I != E; ++I) { @@ -332,12 +388,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) { if (!Term || Term->isUnconditional()) { if (isTopOfStack(*I)) closeControlFlow(*I); + continue; } if (I.nodeVisited(Term->getSuccessor(1))) { if (isTopOfStack(*I)) closeControlFlow(*I); + handleLoop(Term); continue; } diff --git a/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp new file mode 100644 index 000000000000..65ceff3930ac --- /dev/null +++ b/lib/Target/AMDGPU/SIDebuggerInsertNops.cpp @@ -0,0 +1,96 @@ +//===--- SIDebuggerInsertNops.cpp - Inserts nops for debugger usage -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Inserts one nop instruction for each high level source statement for +/// debugger usage. +/// +/// Tools, such as a debugger, need to pause execution based on user input (i.e. +/// breakpoint). In order to do this, one nop instruction is inserted before the +/// first isa instruction of each high level source statement. Further, the +/// debugger may replace nop instructions with trap instructions based on user +/// input. +// +//===----------------------------------------------------------------------===// + +#include "SIInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +using namespace llvm; + +#define DEBUG_TYPE "si-debugger-insert-nops" +#define PASS_NAME "SI Debugger Insert Nops" + +namespace { + +class SIDebuggerInsertNops : public MachineFunctionPass { +public: + static char ID; + + SIDebuggerInsertNops() : MachineFunctionPass(ID) { } + const char *getPassName() const override { return PASS_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // anonymous namespace + +INITIALIZE_PASS(SIDebuggerInsertNops, DEBUG_TYPE, PASS_NAME, false, false) + +char SIDebuggerInsertNops::ID = 0; +char &llvm::SIDebuggerInsertNopsID = SIDebuggerInsertNops::ID; + +FunctionPass *llvm::createSIDebuggerInsertNopsPass() { + return new SIDebuggerInsertNops(); +} + +bool SIDebuggerInsertNops::runOnMachineFunction(MachineFunction &MF) { + // Skip this pass if "amdgpu-debugger-insert-nops" attribute was not + // specified. + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + if (!ST.debuggerInsertNops()) + return false; + + // Skip machine functions without debug info. + if (!MF.getMMI().hasDebugInfo()) + return false; + + // Target instruction info. + const SIInstrInfo *TII = ST.getInstrInfo(); + + // Set containing line numbers that have nop inserted. + DenseSet<unsigned> NopInserted; + + for (auto &MBB : MF) { + for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + // Skip DBG_VALUE instructions and instructions without location. + if (MI->isDebugValue() || !MI->getDebugLoc()) + continue; + + // Insert nop instruction if line number does not have nop inserted. + auto DL = MI->getDebugLoc(); + if (NopInserted.find(DL.getLine()) == NopInserted.end()) { + BuildMI(MBB, *MI, DL, TII->get(AMDGPU::S_NOP)) + .addImm(0); + NopInserted.insert(DL.getLine()); + } + } + } + + return true; +} diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index aa1e352ed748..54efdc0a0466 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -10,8 +10,8 @@ #include "llvm/MC/MCInstrDesc.h" -#ifndef LLVM_LIB_TARGET_R600_SIDEFINES_H -#define LLVM_LIB_TARGET_R600_SIDEFINES_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H +#define LLVM_LIB_TARGET_AMDGPU_SIDEFINES_H namespace SIInstrFlags { // This needs to be kept in sync with the field bits in InstSI. 
@@ -29,16 +29,19 @@ enum { VOP2 = 1 << 11, VOP3 = 1 << 12, VOPC = 1 << 13, + SDWA = 1 << 14, + DPP = 1 << 15, - MUBUF = 1 << 14, - MTBUF = 1 << 15, - SMRD = 1 << 16, - DS = 1 << 17, - MIMG = 1 << 18, - FLAT = 1 << 19, - WQM = 1 << 20, - VGPRSpill = 1 << 21, - VOPAsmPrefer32Bit = 1 << 22 + MUBUF = 1 << 16, + MTBUF = 1 << 17, + SMRD = 1 << 18, + DS = 1 << 19, + MIMG = 1 << 20, + FLAT = 1 << 21, + WQM = 1 << 22, + VGPRSpill = 1 << 23, + VOPAsmPrefer32Bit = 1 << 24, + Gather4 = 1 << 25 }; } @@ -46,9 +49,14 @@ namespace llvm { namespace AMDGPU { enum OperandType { /// Operand with register or 32-bit immediate - OPERAND_REG_IMM32 = llvm::MCOI::OPERAND_FIRST_TARGET, + OPERAND_REG_IMM32 = MCOI::OPERAND_FIRST_TARGET, /// Operand with register or inline constant - OPERAND_REG_INLINE_C + OPERAND_REG_INLINE_C, + + /// Operand with 32-bit immediate that uses the constant bus. The standard + /// OPERAND_IMMEDIATE should be used for special immediates such as source + /// modifiers. + OPERAND_KIMM32 }; } } @@ -77,10 +85,13 @@ namespace SIInstrFlags { }; } +// Input operand modifiers bit-masks +// NEG and SEXT share same bit-mask because they can't be set simultaneously. namespace SISrcMods { enum { - NEG = 1 << 0, - ABS = 1 << 1 + NEG = 1 << 0, // Floating-point negate modifier + ABS = 1 << 1, // Floating-point absolute modifier + SEXT = 1 << 0 // Integer sign-extend modifier }; } @@ -93,6 +104,109 @@ namespace SIOutMods { }; } +namespace llvm { +namespace AMDGPU { +namespace EncValues { // Encoding values of enum9/8/7 operands + +enum { + SGPR_MIN = 0, + SGPR_MAX = 101, + TTMP_MIN = 112, + TTMP_MAX = 123, + INLINE_INTEGER_C_MIN = 128, + INLINE_INTEGER_C_POSITIVE_MAX = 192, // 64 + INLINE_INTEGER_C_MAX = 208, + INLINE_FLOATING_C_MIN = 240, + INLINE_FLOATING_C_MAX = 248, + LITERAL_CONST = 255, + VGPR_MIN = 256, + VGPR_MAX = 511 +}; + +} // namespace EncValues +} // namespace AMDGPU +} // namespace llvm + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { // Encoding of SIMM16 used in s_sendmsg* insns. + +enum Id { // Message ID, width(4) [3:0]. + ID_UNKNOWN_ = -1, + ID_INTERRUPT = 1, + ID_GS, + ID_GS_DONE, + ID_SYSMSG = 15, + ID_GAPS_LAST_, // Indicate that sequence has gaps. + ID_GAPS_FIRST_ = ID_INTERRUPT, + ID_SHIFT_ = 0, + ID_WIDTH_ = 4, + ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) +}; + +enum Op { // Both GS and SYS operation IDs. + OP_UNKNOWN_ = -1, + OP_SHIFT_ = 4, + // width(2) [5:4] + OP_GS_NOP = 0, + OP_GS_CUT, + OP_GS_EMIT, + OP_GS_EMIT_CUT, + OP_GS_LAST_, + OP_GS_FIRST_ = OP_GS_NOP, + OP_GS_WIDTH_ = 2, + OP_GS_MASK_ = (((1 << OP_GS_WIDTH_) - 1) << OP_SHIFT_), + // width(3) [6:4] + OP_SYS_ECC_ERR_INTERRUPT = 1, + OP_SYS_REG_RD, + OP_SYS_HOST_TRAP_ACK, + OP_SYS_TTRACE_PC, + OP_SYS_LAST_, + OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT, + OP_SYS_WIDTH_ = 3, + OP_SYS_MASK_ = (((1 << OP_SYS_WIDTH_) - 1) << OP_SHIFT_) +}; + +enum StreamId { // Stream ID, (2) [9:8]. + STREAM_ID_DEFAULT_ = 0, + STREAM_ID_LAST_ = 4, + STREAM_ID_FIRST_ = STREAM_ID_DEFAULT_, + STREAM_ID_SHIFT_ = 8, + STREAM_ID_WIDTH_= 2, + STREAM_ID_MASK_ = (((1 << STREAM_ID_WIDTH_) - 1) << STREAM_ID_SHIFT_) +}; + +} // namespace SendMsg + +namespace Hwreg { // Encoding of SIMM16 used in s_setreg/getreg* insns. + +enum Id { // HwRegCode, (6) [5:0] + ID_UNKNOWN_ = -1, + ID_SYMBOLIC_FIRST_ = 1, // There are corresponding symbolic names defined. 
+ ID_SYMBOLIC_LAST_ = 8, + ID_SHIFT_ = 0, + ID_WIDTH_ = 6, + ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) +}; + +enum Offset { // Offset, (5) [10:6] + OFFSET_DEFAULT_ = 0, + OFFSET_SHIFT_ = 6, + OFFSET_WIDTH_ = 5, + OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_) +}; + +enum WidthMinusOne { // WidthMinusOne, (5) [15:11] + WIDTH_M1_DEFAULT_ = 31, + WIDTH_M1_SHIFT_ = 11, + WIDTH_M1_WIDTH_ = 5, + WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_) +}; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm + #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 #define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) @@ -134,7 +248,7 @@ namespace SIOutMods { #define C_00B84C_LDS_SIZE 0xFF007FFF #define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) #define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) -#define C_00B84C_EXCP_EN +#define C_00B84C_EXCP_EN #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC #define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 @@ -194,5 +308,7 @@ namespace SIOutMods { #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_SPILLED_SGPRS 0x4 +#define R_SPILLED_VGPRS 0x8 #endif diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f59d9948f98e..9e0086b79087 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -77,7 +77,7 @@ using namespace llvm; -#define DEBUG_TYPE "sgpr-copies" +#define DEBUG_TYPE "si-fix-sgpr-copies" namespace { @@ -237,11 +237,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); SmallVector<MachineInstr *, 16> Worklist; diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp deleted file mode 100644 index 8bda283f0fca..000000000000 --- a/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ /dev/null @@ -1,219 +0,0 @@ -//===-- SIFixSGPRLiveRanges.cpp - Fix SGPR live ranges ----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file SALU instructions ignore the execution mask, so we need to modify the -/// live ranges of the registers they define in some cases. -/// -/// The main case we need to handle is when a def is used in one side of a -/// branch and not another. For example: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// -/// Here we need the register allocator to avoid assigning any of the defs -/// inside of the IF to the same register as %def. In traditional live -/// interval analysis %def is not live inside the IF branch, however, since -/// SALU instructions inside of IF will be executed even if the branch is not -/// taken, there is the chance that one of the instructions will overwrite the -/// value of %def, so the use in ELSE will see the wrong value. 
-/// -/// The strategy we use for solving this is to add an extra use after the ENDIF: -/// -/// %def -/// IF -/// ... -/// ... -/// ELSE -/// %use -/// ... -/// ENDIF -/// %use -/// -/// Adding this use will make the def live throughout the IF branch, which is -/// what we want. - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-fix-sgpr-live-ranges" - -namespace { - -class SIFixSGPRLiveRanges : public MachineFunctionPass { -public: - static char ID; - -public: - SIFixSGPRLiveRanges() : MachineFunctionPass(ID) { - initializeSIFixSGPRLiveRangesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI Fix SGPR live ranges"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveVariables>(); - AU.addPreserved<LiveVariables>(); - - AU.addRequired<MachinePostDominatorTree>(); - AU.addPreserved<MachinePostDominatorTree>(); - AU.setPreservesCFG(); - - MachineFunctionPass::getAnalysisUsage(AU); - } -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, - "SI Fix SGPR Live Ranges", false, false) - -char SIFixSGPRLiveRanges::ID = 0; - -char &llvm::SIFixSGPRLiveRangesID = SIFixSGPRLiveRanges::ID; - -FunctionPass *llvm::createSIFixSGPRLiveRangesPass() { - return new SIFixSGPRLiveRanges(); -} - -bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { - MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - bool MadeChange = false; - - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - SmallVector<unsigned, 16> SGPRLiveRanges; - - LiveVariables *LV = &getAnalysis<LiveVariables>(); - MachineBasicBlock *Entry = &MF.front(); - - // Use a depth first order so that in SSA, we encounter all defs before - // uses. Once the defs of the block have been found, attempt to insert - // SGPR_USE instructions in successor blocks if required. - for (MachineBasicBlock *MBB : depth_first(Entry)) { - for (const MachineInstr &MI : *MBB) { - for (const MachineOperand &MO : MI.defs()) { - // We should never see a live out def of a physical register, so we also - // do not need to worry about implicit_defs(). - unsigned Def = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) { - // Only consider defs that are live outs. We don't care about def / - // use within the same block. - - // LiveVariables does not consider registers that are only used in a - // phi in a sucessor block as live out, unlike LiveIntervals. - // - // This is OK because SIFixSGPRCopies replaced any SGPR phis with - // VGPRs. 
- if (LV->isLiveOut(Def, *MBB)) - SGPRLiveRanges.push_back(Def); - } - } - } - } - - if (MBB->succ_size() < 2) - continue; - - // We have structured control flow, so the number of successors should be - // two. - assert(MBB->succ_size() == 2); - MachineBasicBlock *SuccA = *MBB->succ_begin(); - MachineBasicBlock *SuccB = *(++MBB->succ_begin()); - MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); - - if (!NCD) - continue; - - MachineBasicBlock::iterator NCDTerm = NCD->getFirstTerminator(); - - if (NCDTerm != NCD->end() && NCDTerm->getOpcode() == AMDGPU::SI_ELSE) { - assert(NCD->succ_size() == 2); - // We want to make sure we insert the Use after the ENDIF, not after - // the ELSE. - NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), - *(++NCD->succ_begin())); - } - - for (unsigned Reg : SGPRLiveRanges) { - // FIXME: We could be smarter here. If the register is Live-In to one - // block, but the other doesn't have any SGPR defs, then there won't be a - // conflict. Also, if the branch condition is uniform then there will be - // no conflict. - bool LiveInToA = LV->isLiveIn(Reg, *SuccA); - bool LiveInToB = LV->isLiveIn(Reg, *SuccB); - - if (!LiveInToA && !LiveInToB) { - DEBUG(dbgs() << PrintReg(Reg, TRI, 0) - << " is live into neither successor\n"); - continue; - } - - if (LiveInToA && LiveInToB) { - DEBUG(dbgs() << PrintReg(Reg, TRI, 0) - << " is live into both successors\n"); - continue; - } - - // This interval is live in to one successor, but not the other, so - // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected for " - << PrintReg(Reg, TRI, 0) - << " BB#" << SuccA->getNumber() - << ", BB#" << SuccB->getNumber() - << " with NCD = BB#" << NCD->getNumber() << '\n'); - - assert(TargetRegisterInfo::isVirtualRegister(Reg) && - "Not expecting to extend live range of physreg"); - - // FIXME: Need to figure out how to update LiveRange here so this pass - // will be able to preserve LiveInterval analysis. - MachineInstr *NCDSGPRUse = - BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - - MadeChange = true; - LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); - - DEBUG(NCDSGPRUse->dump()); - } - } - - return MadeChange; -} diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 6230d1e28b74..4ecc0fcc6232 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -13,12 +13,9 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -44,8 +41,6 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -76,11 +71,8 @@ struct FoldCandidate { } // End anonymous namespace. 
-INITIALIZE_PASS_BEGIN(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIFoldOperands, DEBUG_TYPE, - "SI Fold Operands", false, false) +INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE, + "SI Fold Operands", false, false) char SIFoldOperands::ID = 0; @@ -140,7 +132,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) { // Special case for v_mac_f32_e64 if we are trying to fold into src2 unsigned Opc = MI->getOpcode(); @@ -167,7 +159,7 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // see if this makes it possible to fold. unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; - bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); + bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { if (CommuteIdx0 == OpNo) @@ -185,10 +177,10 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return false; if (!CanCommute || - !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) + !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1)) return false; - if (!TII->isOperandLegal(MI, OpNo, OpToFold)) + if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) return false; } @@ -301,9 +293,13 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, } bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 7d20509c464d..03b11f0fd38d 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -11,6 +11,8 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" +#include "AMDGPUSubtarget.h" + #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -21,24 +23,13 @@ using namespace llvm; static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, const MachineFrameInfo *FrameInfo) { - if (!FuncInfo->hasSpilledSGPRs()) - return false; - - if (FuncInfo->hasSpilledVGPRs()) - return false; - - for (int I = FrameInfo->getObjectIndexBegin(), - E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { - if (!FrameInfo->isSpillSlotObjectIndex(I)) - return false; - } - - return true; + return FuncInfo->hasSpilledSGPRs() && + (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects()); } static ArrayRef<MCPhysReg> getAllSGPR128() { - return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), - AMDGPU::SReg_128RegClass.getNumRegs()); + return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(), + AMDGPU::SGPR_128RegClass.getNumRegs()); } static ArrayRef<MCPhysReg> getAllSGPRs() { @@ -48,6 +39,12 @@ static ArrayRef<MCPhysReg> getAllSGPRs() { void 
SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { + // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was + // specified. + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + if (ST.debuggerEmitPrologue()) + emitDebuggerPrologue(MF, MBB); + if (!MF.getFrameInfo()->hasStackObjects()) return; @@ -63,10 +60,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) return; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineBasicBlock::iterator I = MBB.begin(); // We need to insert initialization of the scratch resource descriptor. unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); @@ -84,6 +81,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); } + if (MFI->hasFlatScratchInit()) { + // We don't need this if we only have spills since there is no user facing + // scratch. + + // TODO: If we know we don't have flat instructions earlier, we can omit + // this from the input registers. + // + // TODO: We only need to know if we access scratch space through a flat + // pointer. Because we only detect if flat instructions are used at all, + // this will be used more often than necessary on VI. + + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. + DebugLoc DL; + + unsigned FlatScratchInitReg + = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT); + + MRI.addLiveIn(FlatScratchInitReg); + MBB.addLiveIn(FlatScratchInitReg); + + // Copy the size in bytes. + unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO) + .addReg(FlatScrInitHi, RegState::Kill); + + unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); + + // Add wave offset in bytes to private base offset. + // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo) + .addReg(FlatScrInitLo) + .addReg(ScratchWaveOffsetReg); + + // Convert offset to 256-byte units. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI) + .addReg(FlatScrInitLo, RegState::Kill) + .addImm(8); + } + // If we reserved the original input registers, we don't need to copy to the // reserved registers. if (ScratchRsrcReg == PreloadedPrivateBufferReg) { @@ -96,7 +133,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, // We added live-ins during argument lowering, but since they were not used // they were deleted. We're adding the uses now, so add them back. - MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg); @@ -137,15 +173,28 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { MachineRegisterInfo &MRI = MF.getRegInfo(); - // Skip the last 2 elements because the last one is reserved for VCC, and - // this is the 2nd to last element already. 
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + + // We need to drop register from the end of the list that we cannot use + // for the scratch wave offset. + // + 2 s102 and s103 do not exist on VI. + // + 2 for vcc + // + 2 for xnack_mask + // + 2 for flat_scratch + // + 4 for registers reserved for scratch resource register + // + 1 for register reserved for scratch wave offset. (By exluding this + // register from the list to consider, it means that when this + // register is being used for the scratch wave offset and there + // are no other free SGPRs, then the value will stay in this register. + // ---- + // 13 + for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) { // Pick the first unallocated SGPR. Be careful not to pick an alias of the // scratch descriptor, since we haven’t added its uses yet. if (!MRI.isPhysRegUsed(Reg)) { - assert(MRI.isAllocatable(Reg) && - !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + if (!MRI.isAllocatable(Reg) || + TRI->isSubRegisterEq(ScratchRsrcReg, Reg)) + continue; MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); ScratchWaveOffsetReg = Reg; @@ -160,7 +209,6 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); - MachineBasicBlock::iterator I = MBB.begin(); DebugLoc DL; if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { @@ -223,6 +271,11 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, } } +void SIFrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + +} + void SIFrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { @@ -243,3 +296,44 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( RS->addScavengingFrameIndex(ScavengeFI); } } + +void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Get work group ID SGPR, and make it live-in again. + unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i); + MF.getRegInfo().addLiveIn(WorkGroupIDSGPR); + MBB.addLiveIn(WorkGroupIDSGPR); + + // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in + // order to spill it to scratch. + unsigned WorkGroupIDVGPR = + MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR) + .addReg(WorkGroupIDSGPR); + + // Spill work group ID. + int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false, + WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + + // Get work item ID VGPR, and make it live-in again. + unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i); + MF.getRegInfo().addLiveIn(WorkItemIDVGPR); + MBB.addLiveIn(WorkItemIDVGPR); + + // Spill work item ID. 
+ int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false, + WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + } +} diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h index a9152fd8b2aa..37417d098f31 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.h +++ b/lib/Target/AMDGPU/SIFrameLowering.h @@ -23,10 +23,16 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + void emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; + +private: + /// \brief Emits debugger prologue. + void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 544867513d9c..51241cf0a432 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -18,33 +18,46 @@ #include <cmath> #endif -#include "SIISelLowering.h" #include "AMDGPU.h" -#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; -SITargetLowering::SITargetLowering(TargetMachine &TM, - const AMDGPUSubtarget &STI) +// -amdgpu-fast-fdiv - Command line option to enable faster 2.5 ulp fdiv. 
+static cl::opt<bool> EnableAMDGPUFastFDIV( + "amdgpu-fast-fdiv", + cl::desc("Enable faster 2.5 ulp fdiv"), + cl::init(false)); + +static unsigned findFirstFreeSGPR(CCState &CCInfo) { + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) { + if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) { + return AMDGPU::SGPR0 + Reg; + } + } + llvm_unreachable("Cannot allocate sgpr"); +} + +SITargetLowering::SITargetLowering(const TargetMachine &TM, + const SISubtarget &STI) : AMDGPUTargetLowering(TM, STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); - addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); - addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); - addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass); @@ -66,34 +79,25 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); - - setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::ADDC, MVT::i32, Legal); - setOperationAction(ISD::ADDE, MVT::i32, Legal); - setOperationAction(ISD::SUBC, MVT::i32, Legal); - setOperationAction(ISD::SUBE, MVT::i32, Legal); - - setOperationAction(ISD::FSIN, MVT::f32, Custom); - setOperationAction(ISD::FCOS, MVT::f32, Custom); - - setOperationAction(ISD::FMINNUM, MVT::f64, Legal); - setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); - // We need to custom lower vector stores from local memory + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); setOperationAction(ISD::LOAD, MVT::v16i32, Custom); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::STORE, MVT::v2i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); setOperationAction(ISD::STORE, MVT::v8i32, Custom); setOperationAction(ISD::STORE, MVT::v16i32, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::v4i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + + setOperationAction(ISD::SELECT, MVT::i1, Promote); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); @@ -102,109 +106,39 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i32, Expand); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); + setOperationAction(ISD::SETCC, MVT::i1, Promote); setOperationAction(ISD::SETCC, MVT::v2i1, Expand); setOperationAction(ISD::SETCC, MVT::v4i1, Expand); - setOperationAction(ISD::BSWAP, MVT::i32, Legal); - setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); - - setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); - - for (MVT VT : MVT::integer_valuetypes()) { - if (VT == MVT::i64) - continue; - - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand); - - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand); - } - - for (MVT VT : MVT::integer_vector_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand); - } - - for (MVT VT : MVT::fp_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); - - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - - setTruncStoreAction(MVT::i64, MVT::i32, Expand); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); - setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); - - - setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); - - setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); - setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); - - setOperationAction(ISD::LOAD, MVT::i1, Custom); - - setOperationAction(ISD::LOAD, MVT::v2i64, Promote); - AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::STORE, MVT::v2i64, Promote); - AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); - - setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); - - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::FrameIndex, MVT::i32, Custom); - - // These should use UDIVREM, so set them to expand - setOperationAction(ISD::UDIV, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); - setOperationAction(ISD::SELECT, MVT::i1, Promote); - - setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); - - - setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + 
setOperationAction(ISD::BR_CC, MVT::i1, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Expand); + setOperationAction(ISD::BR_CC, MVT::i64, Expand); + setOperationAction(ISD::BR_CC, MVT::f32, Expand); + setOperationAction(ISD::BR_CC, MVT::f64, Expand); // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { - switch(Op) { + switch (Op) { case ISD::LOAD: case ISD::STORE: case ISD::BUILD_VECTOR: @@ -241,13 +175,46 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } - if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); + + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, + // and output demarshalling + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); + + // We can't return success/failure, only the old value, + // let LLVM add the comparison + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); + + if (getSubtarget()->hasFlatAddressSpace()) { + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + } + + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + // On SI this is s_memtime and s_memrealtime on VI. + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + setOperationAction(ISD::TRAP, MVT::Other, Custom); + + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + + if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); + + setOperationAction(ISD::FSIN, MVT::f32, Custom); + setOperationAction(ISD::FCOS, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FDIV, MVT::f64, Custom); @@ -263,6 +230,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FCANONICALIZE); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. 
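A note on the findFirstFreeSGPR helper added near the top of this SIISelLowering.cpp diff: it walks the SGPR_32 register class in order and returns the first register the calling-convention state has not yet allocated. Below is a minimal standalone sketch of that scan; the register count and the bool-array stand-in for CCState::isAllocated are illustrative, not the real LLVM API.

    #include <array>
    #include <cstdio>
    #include <stdexcept>

    // Illustrative stand-in: the real code queries CCState::isAllocated on
    // AMDGPU::SGPR0 + Reg over AMDGPU::SGPR_32RegClass.getNumRegs() registers.
    constexpr unsigned NumSGPRs = 104;

    unsigned findFirstFreeSGPR(const std::array<bool, NumSGPRs> &Allocated) {
      for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
        if (!Allocated[Reg])
          return Reg; // first SGPR index not claimed by earlier arguments
      }
      throw std::runtime_error("Cannot allocate sgpr");
    }

    int main() {
      std::array<bool, NumSGPRs> Allocated{};        // all free initially
      Allocated[0] = Allocated[1] = Allocated[2] = true; // pretend s0-s2 are taken
      std::printf("first free SGPR: s%u\n", findFirstFreeSGPR(Allocated));
      return 0;
    }

The helper is used later in LowerFormalArguments to pick a scratch wave byte offset SGPR for graphics shaders, which do not go through the kernel's preloaded-SGPR path.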
@@ -287,10 +255,33 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::RegPressure); } +const SISubtarget *SITargetLowering::getSubtarget() const { + return static_cast<const SISubtarget *>(Subtarget); +} + //===----------------------------------------------------------------------===// // TargetLowering queries //===----------------------------------------------------------------------===// +bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &CI, + unsigned IntrID) const { + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align = 0; + Info.vol = false; + Info.readMem = true; + Info.writeMem = true; + return true; + default: + return false; + } +} + bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no @@ -348,7 +339,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. // FIXME: This assumption is currently wrong. On VI we still use @@ -376,16 +367,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (DL.getTypeStoreSize(Ty) < 4) return isLegalMUBUFAddressingMode(AM); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // SMRD instructions have an 8-bit, dword offset on SI. if (!isUInt<8>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) { // On CI+, this can also be a 32-bit literal constant offset. If it fits // in 8-bits, it can use a smaller encoding. if (!isUInt<32>(AM.BaseOffs / 4)) return false; - } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) { // On VI, these use the SMEM format and the offset is 20-bit in bytes. if (!isUInt<20>(AM.BaseOffs)) return false; @@ -402,7 +393,6 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: return isLegalMUBUFAddressingMode(AM); case AMDGPUAS::LOCAL_ADDRESS: @@ -423,6 +413,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; } case AMDGPUAS::FLAT_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + // For an unknown address space, this usually means that this is for some + // reason being used for pure arithmetic, and not based on some addressing + // computation. We don't have instructions that compute pointers with any + // addressing modes, so treat them as having no offset like flat + // instructions. return isLegalFlatAddressingMode(AM); default: @@ -442,24 +438,30 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (!VT.isSimple() || VT == MVT::Other) return false; - // TODO - CI+ supports unaligned memory accesses, but this requires driver - // support. 
- - // XXX - The only mention I see of this in the ISA manual is for LDS direct - // reads the "byte address and must be dword aligned". Is it also true for the - // normal loads and stores? - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) { + if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || + AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. bool AlignedBy4 = (Align % 4 == 0); if (IsFast) *IsFast = AlignedBy4; + return AlignedBy4; } + if (Subtarget->hasUnalignedBufferAccess()) { + // If we have an uniform constant load, it still requires using a slow + // buffer instruction if unaligned. + if (IsFast) { + *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ? + (Align % 4 == 0) : true; + } + + return true; + } + // Smaller than dword value must be aligned. - // FIXME: This should be allowed on CI+ if (VT.bitsLT(MVT::i32)) return false; @@ -500,21 +502,22 @@ static bool isFlatGlobalAddrSpace(unsigned AS) { bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { - return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); } - bool SITargetLowering::isMemOpUniform(const SDNode *N) const { const MemSDNode *MemNode = cast<MemSDNode>(N); const Value *Ptr = MemNode->getMemOperand()->getValue(); // UndefValue means this is a load of a kernel input. These are uniform. - // Sometimes LDS instructions have constant pointers - if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || - isa<GlobalValue>(Ptr)) + // Sometimes LDS instructions have constant pointers. + // If Ptr is null, then that means this mem operand contains a + // PseudoSourceValue like GOT. + if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || + isa<Constant>(Ptr) || isa<GlobalValue>(Ptr)) return true; - const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -528,29 +531,42 @@ SITargetLowering::getPreferredVectorAction(EVT VT) const { bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); return TII->isInlineConstant(Imm); } -SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, - SDLoc SL, SDValue Chain, - unsigned Offset, bool Signed) const { +bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { + + // SimplifySetCC uses this function to determine whether or not it should + // create setcc with i1 operands. We don't have instructions for i1 setcc. 
+ if (VT == MVT::i1 && Op == ISD::SETCC) + return false; + + return TargetLowering::isTypeDesirableForOp(Op, VT); +} + +SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG, + const SDLoc &SL, SDValue Chain, + unsigned Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); - PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg), PtrVT); - SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(Offset, SL, PtrVT)); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, + DAG.getConstant(Offset, SL, PtrVT)); +} +SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, + const SDLoc &SL, SDValue Chain, + unsigned Offset, bool Signed) const { + const DataLayout &DL = DAG.getDataLayout(); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); SDValue PtrOffset = DAG.getUNDEF(PtrVT); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); @@ -560,34 +576,35 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, if (MemVT.isFloatingPoint()) ExtTy = ISD::EXTLOAD; - return DAG.getLoad(ISD::UNINDEXED, ExtTy, - VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment + SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset); + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, + PtrInfo, MemVT, Align, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); } SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); - if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { const Function *Fn = MF.getFunction(); - DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DiagnosticInfoUnsupported NoGraphicsHSA( + *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); DAG.getContext()->diagnose(NoGraphicsHSA); - return SDValue(); + return DAG.getEntryNode(); } - // FIXME: We currently assume all calling conventions are kernels. 
+ // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -596,7 +613,7 @@ SDValue SITargetLowering::LowerFormalArguments( const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { @@ -613,25 +630,26 @@ SDValue SITargetLowering::LowerFormalArguments( ++PSInputNum; } - // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { - ISD::InputArg NewArg = Arg; - NewArg.Flags.setSplit(); - NewArg.VT = Arg.VT.getVectorElementType(); - - // We REALLY want the ORIGINAL number of vertex elements here, e.g. a - // three or five element vertex only needs three or five registers, - // NOT four or eight. - Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); - unsigned NumElements = ParamType->getVectorNumElements(); - - for (unsigned j = 0; j != NumElements; ++j) { - Splits.push_back(NewArg); - NewArg.PartOffset += NewArg.VT.getStoreSize(); + if (AMDGPU::isShader(CallConv)) { + // Second split vertices into their elements + if (Arg.VT.isVector()) { + ISD::InputArg NewArg = Arg; + NewArg.Flags.setSplit(); + NewArg.VT = Arg.VT.getVectorElementType(); + + // We REALLY want the ORIGINAL number of vertex elements here, e.g. a + // three or five element vertex only needs three or five registers, + // NOT four or eight. + Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); + unsigned NumElements = ParamType->getVectorNumElements(); + + for (unsigned j = 0; j != NumElements; ++j) { + Splits.push_back(NewArg); + NewArg.PartOffset += NewArg.VT.getStoreSize(); + } + } else { + Splits.push_back(Arg); } - - } else if (Info->getShaderType() != ShaderType::COMPUTE) { - Splits.push_back(Arg); } } @@ -651,19 +669,27 @@ SDValue SITargetLowering::LowerFormalArguments( // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. - if (Info->getShaderType() == ShaderType::PIXEL && + if (CallConv == CallingConv::AMDGPU_PS && ((Info->getPSInputAddr() & 0x7F) == 0 || - ((Info->getPSInputAddr() & 0xF) == 0 && - Info->isPSInputAllocated(11)))) { + ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { CCInfo.AllocateReg(AMDGPU::VGPR0); CCInfo.AllocateReg(AMDGPU::VGPR1); Info->markPSInputAllocated(0); Info->PSInputEna |= 1; } - if (Info->getShaderType() == ShaderType::COMPUTE) { + if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); + + assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); + } else { + assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() && + !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && + !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && + !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && + !Info->hasWorkItemIDZ()); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
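The vector-splitting loop just above turns one vector InputArg into one scalar InputArg per element, advancing PartOffset by the element's store size each time, so a three- or five-element vertex input really does consume only three or five slots. A rough standalone illustration of that offset bookkeeping follows; the struct and sizes are stand-ins, not LLVM types.

    #include <cstdio>
    #include <vector>

    struct PartArg {
      unsigned PartOffset; // byte offset of this element within the original arg
    };

    // Split a vector argument of NumElements elements, each ElemStoreSize bytes,
    // the way the LowerFormalArguments loop splits shader vertex inputs.
    std::vector<PartArg> splitVectorArg(unsigned NumElements, unsigned ElemStoreSize) {
      std::vector<PartArg> Splits;
      PartArg NewArg{0};
      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += ElemStoreSize;
      }
      return Splits;
    }

    int main() {
      auto Parts = splitVectorArg(3, 4); // a v3f32 input: exactly three 4-byte parts
      for (const PartArg &P : Parts)
        std::printf("part at offset %u\n", P.PartOffset);
      return 0;
    }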
@@ -679,12 +705,24 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(DispatchPtrReg); } + if (Info->hasQueuePtr()) { + unsigned QueuePtrReg = Info->addQueuePtr(*TRI); + MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(QueuePtrReg); + } + if (Info->hasKernargSegmentPtr()) { unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); CCInfo.AllocateReg(InputPtrReg); } + if (Info->hasFlatScratchInit()) { + unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI); + MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(FlatScratchInitReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -713,7 +751,7 @@ SDValue SITargetLowering::LowerFormalArguments( auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be @@ -765,7 +803,7 @@ SDValue SITargetLowering::LowerFormalArguments( NumElements = Arg.VT.getVectorNumElements() - NumElements; Regs.append(NumElements, DAG.getUNDEF(VT)); - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs)); + InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs)); continue; } @@ -780,8 +818,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = Info->addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("work group id x is always enabled"); + } if (Info->hasWorkGroupIDY()) { unsigned Reg = Info->addWorkGroupIDY(); @@ -803,8 +840,13 @@ SDValue SITargetLowering::LowerFormalArguments( if (Info->hasPrivateSegmentWaveByteOffset()) { // Scratch wave offset passed in system SGPR. - unsigned PrivateSegmentWaveByteOffsetReg - = Info->addPrivateSegmentWaveByteOffset(); + unsigned PrivateSegmentWaveByteOffsetReg; + + if (AMDGPU::isShader(CallConv)) { + PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo); + Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg); + } else + PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset(); MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); @@ -812,8 +854,11 @@ SDValue SITargetLowering::LowerFormalArguments( // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. - bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + // Record that we know we have non-spill stack objects so we don't need to + // check all stack objects later. + if (HasStackObjects) + Info->setHasNonSpillStackObjects(true); if (ST.isAmdHsaOS()) { // TODO: Assume we will spill without optimizations. 
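For the user SGPR setup earlier in this LowerFormalArguments hunk (dispatch pointer, queue pointer, kernarg segment pointer, flat scratch init), each enabled input is added as a live-in from SReg_64 and reserved with the calling-convention state. The sketch below only tallies how the preloaded-SGPR count grows as these features are enabled, assuming each 64-bit input occupies an SGPR pair and the scratch resource descriptor occupies four SGPRs, as the SReg_64/SGPR_128 register classes in the diff suggest; the flags and function are illustrative, not the SIMachineFunctionInfo API.

    #include <cstdio>

    struct KernelInputs {
      bool HasPrivateSegmentBuffer; // 128-bit scratch resource descriptor
      bool HasDispatchPtr;
      bool HasQueuePtr;
      bool HasKernargSegmentPtr;
      bool HasFlatScratchInit;
    };

    // Count 32-bit SGPRs consumed by the enabled preloaded inputs.
    unsigned countPreloadedSGPRs(const KernelInputs &In) {
      unsigned N = 0;
      if (In.HasPrivateSegmentBuffer) N += 4; // SGPR_128 pair of pairs
      if (In.HasDispatchPtr)          N += 2; // SReg_64
      if (In.HasQueuePtr)             N += 2;
      if (In.HasKernargSegmentPtr)    N += 2;
      if (In.HasFlatScratchInit)      N += 2;
      return N;
    }

    int main() {
      KernelInputs In{true, true, true, true, true};
      std::printf("preloaded SGPRs: %u\n", countPreloadedSGPRs(In)); // prints 12
      return 0;
    }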
@@ -866,8 +911,7 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); CCInfo.AllocateReg(Reg); - } else - llvm_unreachable("workitem id x should always be enabled"); + } if (Info->hasWorkItemIDY()) { unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); @@ -887,16 +931,16 @@ SDValue SITargetLowering::LowerFormalArguments( return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } -SDValue SITargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { +SDValue +SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SDLoc &DL, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - if (Info->getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(CallConv)) return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); @@ -975,17 +1019,131 @@ SDValue SITargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps); + unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN; + return DAG.getNode(Opc, DL, MVT::Other, RetOps); } -MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( - MachineInstr * MI, MachineBasicBlock * BB) const { - - switch (MI->getOpcode()) { +unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const { + unsigned Reg = StringSwitch<unsigned>(RegName) + .Case("m0", AMDGPU::M0) + .Case("exec", AMDGPU::EXEC) + .Case("exec_lo", AMDGPU::EXEC_LO) + .Case("exec_hi", AMDGPU::EXEC_HI) + .Case("flat_scratch", AMDGPU::FLAT_SCR) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) + .Default(AMDGPU::NoRegister); + + if (Reg == AMDGPU::NoRegister) { + report_fatal_error(Twine("invalid register name \"" + + StringRef(RegName) + "\".")); + + } + + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS && + Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { + report_fatal_error(Twine("invalid register \"" + + StringRef(RegName) + "\" for subtarget.")); + } + + switch (Reg) { + case AMDGPU::M0: + case AMDGPU::EXEC_LO: + case AMDGPU::EXEC_HI: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: + if (VT.getSizeInBits() == 32) + return Reg; + break; + case AMDGPU::EXEC: + case AMDGPU::FLAT_SCR: + if (VT.getSizeInBits() == 64) + return Reg; + break; default: - return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + llvm_unreachable("missing register type checking"); + } + + report_fatal_error(Twine("invalid type for register \"" + + StringRef(RegName) + "\".")); +} + +// If kill is not the last instruction, split the block so kill is always a +// proper terminator. +MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineBasicBlock::iterator SplitPoint(&MI); + ++SplitPoint; + + if (SplitPoint == BB->end()) { + // Don't bother with a new block. 
+ MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return BB; + } + + MachineFunction *MF = BB->getParent(); + MachineBasicBlock *SplitBB + = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + // Fix the block phi references to point to the new block for the defs in the + // second piece of the block. + for (MachineBasicBlock *Succ : BB->successors()) { + for (MachineInstr &MI : *Succ) { + if (!MI.isPHI()) + break; + + for (unsigned I = 2, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &FromBB = MI.getOperand(I); + if (BB == FromBB.getMBB()) { + FromBB.setMBB(SplitBB); + break; + } + } + } + } + + MF->insert(++MachineFunction::iterator(BB), SplitBB); + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + + SplitBB->transferSuccessors(BB); + BB->addSuccessor(SplitBB); + + MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR)); + return SplitBB; +} + +MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { + case AMDGPU::SI_INIT_M0: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(), + TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addOperand(MI.getOperand(0)); + MI.eraseFromParent(); + break; + } case AMDGPU::BRANCH: return BB; + case AMDGPU::GET_GROUPSTATICSIZE: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + + MachineFunction *MF = BB->getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32)) + .addOperand(MI.getOperand(0)) + .addImm(MFI->LDSSize); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::SI_KILL: + return splitKillBlock(MI, BB); + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } return BB; } @@ -1072,6 +1230,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerTrig(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::FDIV: return LowerFDIV(Op, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::GlobalAddress: { MachineFunction &MF = DAG.getMachineFunction(); @@ -1079,7 +1238,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerGlobalAddress(MFI, Op, DAG); } case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG); + case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG); + case ISD::TRAP: return lowerTRAP(Op, DAG); } return SDValue(); } @@ -1106,25 +1268,78 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. 
// - // If we know the machine uses less than 128GB of scratch, then we can - // amrk the high bit of the FrameIndex node as known zero, - // which is important, because it means in most situations we can - // prove that values derived from FrameIndex nodes are non-negative. - // This enables us to take advantage of more addressing modes when - // accessing scratch buffers, since for scratch reads/writes, the register - // offset must always be positive. + // The maximum private allocation for the entire GPU is 4G, and we are + // concerned with the largest the index could ever be for an individual + // workitem. This will occur with the minmum dispatch size. If a program + // requires more, the dispatch size will be reduced. + // + // With this limit, we can mark the high bit of the FrameIndex node as known + // zero, which is important, because it means in most situations we can prove + // that values derived from FrameIndex nodes are non-negative. This enables us + // to take advantage of more addressing modes when accessing scratch buffers, + // since for scratch reads/writes, the register offset must always be + // positive. - SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - if (Subtarget->enableHugeScratchBuffer()) - return TFI; + uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; + // XXX - It is unclear if partial dispatch works. Assume it works at half wave + // granularity. It is probably a full wave. + uint64_t MinGranularity = 32; + + unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); + + SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); + DAG.getValueType(ExtVT)); +} + +bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { + if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) + return false; + + switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) { + default: return false; + case AMDGPUIntrinsic::amdgcn_if: + case AMDGPUIntrinsic::amdgcn_else: + case AMDGPUIntrinsic::amdgcn_break: + case AMDGPUIntrinsic::amdgcn_if_break: + case AMDGPUIntrinsic::amdgcn_else_break: + case AMDGPUIntrinsic::amdgcn_loop: + case AMDGPUIntrinsic::amdgcn_end_cf: + return true; + } +} + +void SITargetLowering::createDebuggerPrologueStackObjects( + MachineFunction &MF) const { + // Create stack objects that are used for emitting debugger prologue. + // + // Debugger prologue writes work group IDs and work item IDs to scratch memory + // at fixed location in the following format: + // offset 0: work group ID x + // offset 4: work group ID y + // offset 8: work group ID z + // offset 16: work item ID x + // offset 20: work item ID y + // offset 24: work item ID z + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + int ObjectIdx = 0; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Create fixed stack object for work group ID. + ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true); + Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); + // Create fixed stack object for work item ID. 
+ ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true); + Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); + } } /// This transforms the control flow intrinsics to get the branch destination as @@ -1137,13 +1352,11 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SDNode *Intr = BRCOND.getOperand(1).getNode(); SDValue Target = BRCOND.getOperand(2); SDNode *BR = nullptr; + SDNode *SetCC = nullptr; if (Intr->getOpcode() == ISD::SETCC) { // As long as we negate the condition everything is fine - SDNode *SetCC = Intr; - assert(SetCC->getConstantOperandVal(1) == 1); - assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == - ISD::SETNE); + SetCC = Intr; Intr = SetCC->getOperand(0).getNode(); } else { @@ -1152,7 +1365,15 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Target = BR->getOperand(1); } - assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + if (!isCFIntrinsic(Intr)) { + // This is a uniform branch so we don't need to legalize. + return BRCOND; + } + + assert(!SetCC || + (SetCC->getConstantOperandVal(1) == 1 && + cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == + ISD::SETNE)); // Build the result and ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); @@ -1204,37 +1425,185 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, return Chain; } +SDValue SITargetLowering::getSegmentAperture(unsigned AS, + SelectionDAG &DAG) const { + SDLoc SL; + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + assert(UserSGPR != AMDGPU::NoRegister); + + SDValue QueuePtr = CreateLiveInRegister( + DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64); + + // Offset into amd_queue_t for group_segment_aperture_base_hi / + // private_segment_aperture_base_hi. + uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; + + SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr, + DAG.getConstant(StructOffset, SL, MVT::i64)); + + // TODO: Use custom target PseudoSourceValue. + // TODO: We should use the value from the IR intrinsic call, but it might not + // be available and how do we get it? + Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()), + AMDGPUAS::CONSTANT_ADDRESS)); + + MachinePointerInfo PtrInfo(V, StructOffset); + return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo, + MinAlign(64, StructOffset), + MachineMemOperand::MOInvariant); +} + +SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op); + + SDValue Src = ASC->getOperand(0); + + // FIXME: Really support non-0 null pointers. 
+ SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32); + SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64); + + // flat -> local/private + if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE); + SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + + return DAG.getNode(ISD::SELECT, SL, MVT::i32, + NonNull, Ptr, SegmentNullPtr); + } + } + + // local/private -> flat + if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { + SDValue NonNull + = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE); + + SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG); + SDValue CvtPtr + = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture); + + return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, + DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr), + FlatNullPtr); + } + } + + // global <-> flat are no-ops and never emitted. + + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported InvalidAddrSpaceCast( + *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc()); + DAG.getContext()->diagnose(InvalidAddrSpaceCast); + + return DAG.getUNDEF(ASC->getValueType(0)); +} + +static bool shouldEmitGOTReloc(const GlobalValue *GV, + const TargetMachine &TM) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); +} + +bool +SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + // We can fold offsets for anything that doesn't require a GOT relocation. + return GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + !shouldEmitGOTReloc(GA->getGlobal(), getTargetMachine()); +} + +static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, + SDLoc DL, unsigned Offset, EVT PtrVT, + unsigned GAFlags = SIInstrInfo::MO_NONE) { + // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is + // lowered to the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // a fixup or relocation is emitted to replace $symbol with a literal + // constant, which is a pc-relative offset from the encoding of the $symbol + // operand to the global variable. + // + // What we want here is an offset from the value returned by s_getpc + // (which is the address of the s_add_u32 instruction) to the global + // variable, but since the encoding of $symbol starts 4 bytes after the start + // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order to + // compute the correct address. 
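// [Editorial illustration, not part of the upstream source; the addresses are
// invented.] Suppose the s_add_u32 sits at 0x1000, so its 32-bit literal is
// encoded at 0x1004, and the global lives at 0x2000. s_getpc_b64 returns
// 0x1000, and the literal we want is 0x2000 - 0x1000 = 0x1000. A relocation
// computed against the literal's own encoding address would give
// 0x2000 - 0x1004 = 0xffc, i.e. 4 too small, which is why 4 is added to the
// offset when building the target global address below.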
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, + GAFlags); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, GA); +} + SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); - if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) + if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); const GlobalValue *GV = GSD->getGlobal(); - MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); + EVT PtrVT = Op.getValueType(); + + if (!shouldEmitGOTReloc(GV, getTargetMachine())) + return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); + + SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT, + SIInstrInfo::MO_GOTPCREL); + + Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); + PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); + const DataLayout &DataLayout = DAG.getDataLayout(); + unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + // FIXME: Use a PseudoSourceValue once those can be assigned an address space. + MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)); - SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, + MachineMemOperand::MOInvariant); } -SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, - SDValue V) const { +SDValue SITargetLowering::lowerTRAP(SDValue Op, + SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + DiagnosticInfoUnsupported NoTrap(*MF.getFunction(), + "trap handler not supported", + Op.getDebugLoc(), + DS_Warning); + DAG.getContext()->diagnose(NoTrap); + + // Emit s_endpgm. + + // FIXME: This should really be selected to s_trap, but that requires + // setting up the trap handler for it o do anything. + return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, + Op.getOperand(0)); +} + +SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, + const SDLoc &DL, SDValue V) const { + // We can't use S_MOV_B32 directly, because there is no way to specify m0 as + // the destination register. + // // We can't use CopyToReg, because MachineCSE won't combine COPY instructions, // so we will end up with redundant moves to m0. // - // We can't use S_MOV_B32, because there is no way to specify m0 as the - // destination register. - // - // We have to use them both. Machine cse will combine all the S_MOV_B32 - // instructions and the register coalescer eliminate the extra copies. - SDNode *M0 = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, V.getValueType(), V); - return DAG.getCopyToReg(Chain, DL, DAG.getRegister(AMDGPU::M0, MVT::i32), - SDValue(M0, 0), SDValue()); // Glue - // A Null SDValue creates - // a glue result. + // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result. + + // A Null SDValue creates a glue result. 
+ SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue, + V, Chain); + return SDValue(M0, 0); } SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, @@ -1249,12 +1618,27 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, DAG.getValueType(VT)); } +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -1264,62 +1648,134 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, switch (IntrinsicID) { case Intrinsic::amdgcn_dispatch_ptr: + case Intrinsic::amdgcn_queue_ptr: { if (!Subtarget->isAmdHsaOS()) { - DiagnosticInfoUnsupported BadIntrin(*MF.getFunction(), - "hsa intrinsic without hsa target"); + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "unsupported hsa intrinsic without hsa target", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } + auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); - + TRI->getPreloadedValue(MF, Reg), VT); + } + case Intrinsic::amdgcn_implicitarg_ptr: { + unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); + return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset); + } + case Intrinsic::amdgcn_kernarg_segment_ptr: { + unsigned Reg + = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + } + case Intrinsic::amdgcn_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq: + case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } + case Intrinsic::amdgcn_rsq_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + + Type *Type = VT.getTypeForEVT(*DAG.getContext()); + APFloat Max = APFloat::getLargest(Type->getFltSemantics()); + APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); + + SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, + DAG.getConstantFP(Max, DL, VT)); + return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp, + DAG.getConstantFP(Min, DL, VT)); + } case Intrinsic::r600_read_ngroups_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: + if (Subtarget->isAmdHsaOS()) + return emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: + if (Subtarget->isAmdHsaOS()) + return 
emitNonHSAIntrinsicError(DAG, DL, VT); + return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); - case Intrinsic::AMDGPU_read_workdim: + case Intrinsic::amdgcn_read_workdim: + case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name. // Really only 2 bits. return lowerImplicitZextParam(DAG, Op, MVT::i8, getImplicitParameterOffset(MFI, GRID_DIM)); + case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); + case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); @@ -1336,24 +1792,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } - case AMDGPUIntrinsic::SI_sample: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); - case AMDGPUIntrinsic::SI_sampleb: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); - case AMDGPUIntrinsic::SI_sampled: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); - case AMDGPUIntrinsic::SI_samplel: - return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case AMDGPUIntrinsic::AMDGPU_fract: - case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name. 
- return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), - DAG.getNode(ISD::FFLOOR, DL, VT, Op.getOperand(1))); case AMDGPUIntrinsic::SI_fs_constant: { SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); SDValue Glue = M0.getValue(1); @@ -1393,11 +1837,93 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_sin: + return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_cos: + return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_log_clamp: { + if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + return SDValue(); + + DiagnosticInfoUnsupported BadIntrin( + *MF.getFunction(), "intrinsic not supported on subtarget", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); + } + case Intrinsic::amdgcn_ldexp: + return DAG.getNode(AMDGPUISD::LDEXP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::amdgcn_fract: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + + case Intrinsic::amdgcn_class: + return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), + Op.getOperand(4)); + + case Intrinsic::amdgcn_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_div_scale: { + // 3rd parameter required to be a constant. + const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); + if (!Param) + return DAG.getUNDEF(VT); + + // Translate to the operands expected by the machine instruction. The + // first parameter must be the same as the first instruction. + SDValue Numerator = Op.getOperand(1); + SDValue Denominator = Op.getOperand(2); + + // Note this order is opposite of the machine instruction's operations, + // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The + // intrinsic has the numerator as the first operand to match a normal + // division operation. + + SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator; + + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, + Denominator, Numerator); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } } +SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + switch (IntrID) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec: { + MemSDNode *M = cast<MemSDNode>(Op); + unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ? 
+ AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC; + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops, + M->getMemoryVT(), M->getMemOperand()); + } + default: + return SDValue(); + } +} + SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1439,6 +1965,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::AMDGPU_kill: { + if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Op.getOperand(2))) { + if (!K->isNegative()) + return Chain; + } + + return Op; + } default: return SDValue(); } @@ -1447,48 +1981,92 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *Load = cast<LoadSDNode>(Op); + ISD::LoadExtType ExtType = Load->getExtensionType(); + EVT MemVT = Load->getMemoryVT(); - if (Op.getValueType().isVector()) { - assert(Op.getValueType().getVectorElementType() == MVT::i32 && - "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned NumElements = Op.getValueType().getVectorNumElements(); - assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { + assert(MemVT == MVT::i1 && "Only i1 non-extloads expected"); + // FIXME: Copied from PPC + // First, load into 32 bits, then truncate to 1 bit. - switch (Load->getAddressSpace()) { - default: break; - case AMDGPUAS::CONSTANT_ADDRESS: - if (isMemOpUniform(Load)) - break; - // Non-uniform loads will be selected to MUBUF instructions, so they - // have the same legalization requires ments as global and private - // loads. - // - // Fall-through - case AMDGPUAS::GLOBAL_ADDRESS: - case AMDGPUAS::PRIVATE_ADDRESS: - if (NumElements >= 8) - return SplitVectorLoad(Op, DAG); - - // v4 loads are supported for private and global memory. - if (NumElements <= 4) - break; - // fall-through - case AMDGPUAS::LOCAL_ADDRESS: - // If properly aligned, if we split we might be able to use ds_read_b64. + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + MachineMemOperand *MMO = Load->getMemOperand(); + + SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, + BasePtr, MVT::i8, MMO); + + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + if (!MemVT.isVector()) + return SDValue(); + + assert(Op.getValueType().getVectorElementType() == MVT::i32 && + "Custom lowering for non-i32 vectors hasn't been implemented."); + + unsigned AS = Load->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, + AS, Load->getAlignment())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + + unsigned NumElements = MemVT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + return SDValue(); + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. 
+ // + // Fall-through + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + // Depending on the setting of the private_element_size field in the + // resource descriptor, we can only make private accesses up to a certain + // size. + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorLoad(Load, DAG); + case 8: + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); + return SDValue(); + case 16: + // Same as global/flat + if (NumElements > 4) return SplitVectorLoad(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); } } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) + return SplitVectorLoad(Op, DAG); - return AMDGPUTargetLowering::LowerLOAD(Op, DAG); -} + if (NumElements == 2) + return SDValue(); -SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, - const SDValue &Op, - SelectionDAG &DAG) const { - return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4)); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); + } + default: + return SDValue(); + } } SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { @@ -1514,7 +2092,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1); - SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi); + SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi}); return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res); } @@ -1547,7 +2125,9 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } } - if (Unsafe) { + const SDNodeFlags *Flags = Op->getFlags(); + + if (Unsafe || Flags->hasAllowReciprocal()) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) SDNodeFlags Flags; @@ -1560,45 +2140,71 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { } SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - SDValue FastLowered = LowerFastFDIV(Op, DAG); - if (FastLowered.getNode()) + if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) return FastLowered; - // This uses v_rcp_f32 which does not handle denormals. Let this hit a - // selection error for now rather than do something incorrect. - if (Subtarget->hasFP32Denormals()) - return SDValue(); - SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag + if (EnableAMDGPUFastFDIV) { + // This does not support denormals. + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + + // TODO: Should this propagate fast-math-flags? 
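// [Editorial note, not part of the upstream source] The K0/K1 magic constants
// in this fast path decode as 0x6f800000 == 2^96 and 0x2f800000 == 2^-32 in
// IEEE-754 single precision. When |RHS| exceeds 2^96, the divisor is
// pre-scaled by 2^-32 before v_rcp_f32 is applied, and the same 2^-32 factor
// multiplies the final product, so the quotient is unchanged while the
// intermediate reciprocal is kept away from the denormal range that rcp does
// not handle.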
- const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + // rcp does not support denormals. + SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + } + + // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, RHS, RHS, LHS); + SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, LHS, RHS, LHS); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // Denominator is scaled to not be denormal, so using rcp is ok. + SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled); - // TODO: Should this propagate fast-math-flags? + SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, ApproxRcp, One); + SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, ApproxRcp); - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled); + SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul); + SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled); - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); + SDValue Scale = NumeratorScaled.getValue(1); + SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, Fma4, Fma1, Fma3, Scale); + + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -1635,7 +2241,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -1685,26 +2291,57 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { StoreSDNode *Store = cast<StoreSDNode>(Op); EVT VT = Store->getMemoryVT(); - // These stores are legal. 
- if (Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - if (VT.isVector() && VT.getVectorNumElements() > 4) - return ScalarizeVectorStore(Op, DAG); - return SDValue(); + if (VT == MVT::i1) { + return DAG.getTruncStore(Store->getChain(), DL, + DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), + Store->getBasePtr(), MVT::i1, Store->getMemOperand()); } - SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); - if (Ret.getNode()) - return Ret; + assert(VT.isVector() && + Store->getValue().getValueType().getScalarType() == MVT::i32); - if (VT.isVector() && VT.getVectorNumElements() >= 8) + unsigned AS = Store->getAddressSpace(); + if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AS, Store->getAlignment())) { + return expandUnalignedStore(Store, DAG); + } + + unsigned NumElements = VT.getVectorNumElements(); + switch (AS) { + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::FLAT_ADDRESS: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + case AMDGPUAS::PRIVATE_ADDRESS: { + switch (Subtarget->getMaxPrivateElementSize()) { + case 4: + return scalarizeVectorStore(Store, DAG); + case 8: + if (NumElements > 2) + return SplitVectorStore(Op, DAG); + return SDValue(); + case 16: + if (NumElements > 4) + return SplitVectorStore(Op, DAG); + return SDValue(); + default: + llvm_unreachable("unsupported private_element_size"); + } + } + case AMDGPUAS::LOCAL_ADDRESS: { + if (NumElements > 2) return SplitVectorStore(Op, DAG); - if (VT == MVT::i1) - return DAG.getTruncStore(Store->getChain(), DL, - DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), - Store->getBasePtr(), MVT::i1, Store->getMemOperand()); + if (NumElements == 2) + return Op; - return SDValue(); + // If properly aligned, if we split we might be able to use ds_write_b64. + return SplitVectorStore(Op, DAG); + } + default: + llvm_unreachable("unhandled address space"); + } } SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { @@ -1727,6 +2364,33 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { } } +SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const { + AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op); + assert(AtomicNode->isCompareAndSwap()); + unsigned AS = AtomicNode->getAddressSpace(); + + // No custom lowering required for local address space + if (!isFlatGlobalAddrSpace(AS)) + return Op; + + // Non-local address space requires custom lowering for atomic compare + // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2 + SDLoc DL(Op); + SDValue ChainIn = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + SDValue Old = Op.getOperand(2); + SDValue New = Op.getOperand(3); + EVT VT = Op.getValueType(); + MVT SimpleVT = VT.getSimpleVT(); + MVT VecType = MVT::getVectorVT(SimpleVT, 2); + + SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old}); + SDValue Ops[] = { ChainIn, Addr, NewOld }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), + Ops, VT, AtomicNode->getMemOperand()); +} + //===----------------------------------------------------------------------===// // Custom DAG optimizations //===----------------------------------------------------------------------===// @@ -1756,88 +2420,13 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, } } - // We are primarily trying to catch operations on illegal vector types - // before they are expanded. 
- // For scalars, we can use the more flexible method of checking masked bits - // after legalization. - if (!DCI.isBeforeLegalize() || - !SrcVT.isVector() || - SrcVT.getVectorElementType() != MVT::i8) { - return SDValue(); - } - - assert(DCI.isBeforeLegalize() && "Unexpected legal type"); - - // Weird sized vectors are a pain to handle, but we know 3 is really the same - // size as 4. - unsigned NElts = SrcVT.getVectorNumElements(); - if (!SrcVT.isSimple() && NElts != 3) - return SDValue(); - - // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to - // prevent a mess from expanding to v4i32 and repacking. - if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { - EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT); - EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT); - EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts); - LoadSDNode *Load = cast<LoadSDNode>(Src); - - unsigned AS = Load->getAddressSpace(); - unsigned Align = Load->getAlignment(); - Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty); - - // Don't try to replace the load if we have to expand it due to alignment - // problems. Otherwise we will end up scalarizing the load, and trying to - // repack into the vector for no real reason. - if (Align < ABIAlignment && - !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) { - return SDValue(); - } - - SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT, - Load->getChain(), - Load->getBasePtr(), - LoadVT, - Load->getMemOperand()); - - // Make sure successors of the original load stay after it by updating - // them to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1)); - - SmallVector<SDValue, 4> Elts; - if (RegVT.isVector()) - DAG.ExtractVectorElements(NewLoad, Elts); - else - Elts.push_back(NewLoad); - - SmallVector<SDValue, 4> Ops; - - unsigned EltIdx = 0; - for (SDValue Elt : Elts) { - unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx); - for (unsigned I = 0; I < ComponentsInElt; ++I) { - unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I; - SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt); - DCI.AddToWorklist(Cvt.getNode()); - Ops.push_back(Cvt); - } - - ++EltIdx; - } - - assert(Ops.size() == NElts); - - return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops); - } - return SDValue(); } /// \brief Return true if the given offset Size in bytes can be folded into /// the immediate offsets of a memory instruction for the given address space. static bool canFoldOffset(unsigned OffsetSize, unsigned AS, - const AMDGPUSubtarget &STI) { + const SISubtarget &STI) { switch (AS) { case AMDGPUAS::GLOBAL_ADDRESS: { // MUBUF instructions a 12-bit offset in bytes. @@ -1846,7 +2435,7 @@ static bool canFoldOffset(unsigned OffsetSize, unsigned AS, case AMDGPUAS::CONSTANT_ADDRESS: { // SMRD instructions have an 8-bit offset in dwords on SI and // a 20-bit offset in bytes on VI. - if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) return isUInt<20>(OffsetSize); else return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4); @@ -1897,7 +2486,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, // If the resulting offset is too large, we can't fold it into the addressing // mode offset. 
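// [Editorial note, not part of the upstream source] Concretely, with the
// limits encoded in canFoldOffset above: a MUBUF offset is foldable only if it
// fits in 12 bits (at most 4095 bytes); an SMRD offset on SI must be a
// multiple of 4 whose dword count fits in 8 bits (at most 1020 bytes); on VI
// any byte offset that fits in 20 bits is foldable.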
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); - if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget)) + if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget())) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -1915,6 +2504,9 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, if (DCI.isBeforeLegalize()) return SDValue(); + if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI)) + return Base; + SelectionDAG &DAG = DCI.DAG; // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> @@ -1970,6 +2562,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + EVT VT = N->getValueType(0); + if (VT == MVT::i64) { + // TODO: This could be a generic combine with a predicate for extracting the + // high half of an integer being free. + + // (or i64:x, (zero_extend i32:y)) -> + // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x))) + if (LHS.getOpcode() == ISD::ZERO_EXTEND && + RHS.getOpcode() != ISD::ZERO_EXTEND) + std::swap(LHS, RHS); + + if (RHS.getOpcode() == ISD::ZERO_EXTEND) { + SDValue ExtSrc = RHS.getOperand(0); + EVT SrcVT = ExtSrc.getValueType(); + if (SrcVT == MVT::i32) { + SDLoc SL(N); + SDValue LowLHS, HiBits; + std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG); + SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc); + + DCI.AddToWorklist(LowOr.getNode()); + DCI.AddToWorklist(HiBits.getNode()); + + SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, + LowOr, HiBits); + return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec); + } + } + } + // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) if (LHS.getOpcode() == AMDGPUISD::FP_CLASS && RHS.getOpcode() == AMDGPUISD::FP_CLASS) { @@ -2005,9 +2627,52 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } +// Constant fold canonicalize. +SDValue SITargetLowering::performFCanonicalizeCombine( + SDNode *N, + DAGCombinerInfo &DCI) const { + ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); + if (!CFP) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + const APFloat &C = CFP->getValueAPF(); + + // Flush denormals to 0 if not enabled. + if (C.isDenormal()) { + EVT VT = N->getValueType(0); + if (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + + if (VT == MVT::f64 && !Subtarget->hasFP64Denormals()) + return DAG.getConstantFP(0.0, SDLoc(N), VT); + } + + if (C.isNaN()) { + EVT VT = N->getValueType(0); + APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics()); + if (C.isSignaling()) { + // Quiet a signaling NaN. + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + // Make sure it is the canonical NaN bitpattern. + // + // TODO: Can we use -1 as the canonical NaN value since it's an inline + // immediate? 
+ if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt()) + return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT); + } + + return SDValue(CFP, 0); +} + static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { switch (Opc) { case ISD::FMAXNUM: @@ -2027,8 +2692,64 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { } } -SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, - DAGCombinerInfo &DCI) const { +static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1, bool Signed) { + ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + if (Signed) { + if (K0->getAPIntValue().sge(K1->getAPIntValue())) + return SDValue(); + } else { + if (K0->getAPIntValue().uge(K1->getAPIntValue())) + return SDValue(); + } + + EVT VT = K0->getValueType(0); + return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT, + Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); +} + +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, + SDValue Op0, SDValue Op1) { + ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1); + if (!K1) + return SDValue(); + + ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1)); + if (!K0) + return SDValue(); + + // Ordered >= (although NaN inputs should have folded away by now). + APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); + if (Cmp == APFloat::cmpGreaterThan) + return SDValue(); + + // This isn't safe with signaling NaNs because in IEEE mode, min/max on a + // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then + // give the other result, which is different from med3 with a NaN input. + SDValue Var = Op0.getOperand(0); + if (!isKnownNeverSNan(DAG, Var)) + return SDValue(); + + return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), + Var, SDValue(K0, 0), SDValue(K1, 0)); +} + +SDValue SITargetLowering::performMinMaxCombine(SDNode *N, + DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; unsigned Opc = N->getOpcode(); @@ -2038,26 +2759,51 @@ SDValue SITargetLowering::performMin3Max3Combine(SDNode *N, // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - // max(max(a, b), c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); + if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) { + // max(max(a, b), c) -> max3(a, b, c) + // min(min(a, b), c) -> min3(a, b, c) + if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0.getOperand(0), + Op0.getOperand(1), + Op1); + } + + // Try commuted. 
+ // max(a, max(b, c)) -> max3(a, b, c) + // min(a, min(b, c)) -> min3(a, b, c) + if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { + SDLoc DL(N); + return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), + DL, + N->getValueType(0), + Op0, + Op1.getOperand(0), + Op1.getOperand(1)); + } } - // max(a, max(b, c)) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); + // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + return Med3; + } + + if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + return Med3; + } + + // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) + if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) || + (Opc == AMDGPUISD::FMIN_LEGACY && + Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) && + N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) { + if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1)) + return Res; } return SDValue(); @@ -2104,16 +2850,18 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); - case ISD::FMAXNUM: // TODO: What about fmax_legacy? + case ISD::FMAXNUM: case ISD::FMINNUM: case ISD::SMAX: case ISD::SMIN: case ISD::UMAX: - case ISD::UMIN: { + case ISD::UMIN: + case AMDGPUISD::FMIN_LEGACY: + case AMDGPUISD::FMAX_LEGACY: { if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG && N->getValueType(0) != MVT::f64 && getTargetMachine().getOptLevel() > CodeGenOpt::None) - return performMin3Max3Combine(N, DCI); + return performMinMaxCombine(N, DCI); break; } @@ -2122,8 +2870,23 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::CVT_F32_UBYTE2: case AMDGPUISD::CVT_F32_UBYTE3: { unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; - SDValue Src = N->getOperand(0); + + // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (Src.getOpcode() == ISD::SRL) { + // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x + // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) { + unsigned SrcOffset = C->getZExtValue() + 8 * Offset; + if (SrcOffset < 32 && SrcOffset % 8 == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, DL, + MVT::f32, Src.getOperand(0)); + } + } + } + APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); APInt KnownZero, KnownOne; @@ -2238,7 +3001,9 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MIN: case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: - case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics. + case ISD::ATOMIC_LOAD_UMAX: + case AMDGPUISD::ATOMIC_INC: + case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics. 
if (DCI.isBeforeLegalize()) break; @@ -2264,6 +3029,19 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performOrCombine(N, DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); + case ISD::FCANONICALIZE: + return performFCanonicalizeCombine(N, DCI); + case AMDGPUISD::FRACT: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::LDEXP: { + SDValue Src = N->getOperand(0); + if (Src.isUndef()) + return Src; + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } @@ -2273,9 +3051,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate /// and the immediate value if it's a literal immediate int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { - - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { if (TII->isInlineConstant(Node->getAPIntValue())) @@ -2314,7 +3090,8 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, SelectionDAG &DAG) const { SDNode *Users[4] = { }; unsigned Lane = 0; - unsigned OldDmask = Node->getConstantOperandVal(0); + unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3; + unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); unsigned NewDmask = 0; // Try to figure out the used register components @@ -2354,8 +3131,9 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Adjust the writemask in the node std::vector<SDValue> Ops; + Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32)); - Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end()); + Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops); // If we only got one lane, replace it with a copy @@ -2421,14 +3199,15 @@ void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node, /// \brief Fold the instructions after selecting them. 
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + unsigned Opcode = Node->getMachineOpcode(); - if (TII->isMIMG(Node->getMachineOpcode())) + if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() && + !TII->isGather4(Opcode)) adjustWritemask(Node, DAG); - if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG || - Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) { + if (Opcode == AMDGPU::INSERT_SUBREG || + Opcode == AMDGPU::REG_SEQUENCE) { legalizeTargetIndependentNode(Node, DAG); return Node; } @@ -2437,22 +3216,22 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, /// \brief Assign the register class depending on the number of /// bits set in the writemask -void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, +void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - if (TII->isVOP3(MI->getOpcode())) { + if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); return; } - if (TII->isMIMG(*MI)) { - unsigned VReg = MI->getOperand(0).getReg(); - unsigned Writemask = MI->getOperand(1).getImm(); + if (TII->isMIMG(MI)) { + unsigned VReg = MI.getOperand(0).getReg(); + unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4; + unsigned Writemask = MI.getOperand(DmaskIdx).getImm(); unsigned BitsSet = 0; for (unsigned i = 0; i < 4; ++i) BitsSet += Writemask & (1 << i) ? 1 : 0; @@ -2465,34 +3244,58 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, case 3: RC = &AMDGPU::VReg_96RegClass; break; } - unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); - MI->setDesc(TII->get(NewOpcode)); + unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet); + MI.setDesc(TII->get(NewOpcode)); MRI.setRegClass(VReg, RC); return; } // Replace unused atomics with the no return version. - int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode()); + int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode()); if (NoRetAtomicOp != -1) { if (!Node->hasAnyUseOfValue(0)) { - MI->setDesc(TII->get(NoRetAtomicOp)); - MI->RemoveOperand(0); + MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + return; } + // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg + // instruction, because the return type of these instructions is a vec2 of + // the memory type, so it can be tied to the input operand. + // This means these instructions always have a use, so we need to add a + // special case to check if the atomic has only one extract_subreg use, + // which itself has no uses. + if ((Node->hasNUsesOfValue(1, 0) && + Node->use_begin()->isMachineOpcode() && + Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG && + !Node->use_begin()->hasAnyUseOfValue(0))) { + unsigned Def = MI.getOperand(0).getReg(); + + // Change this into a noret atomic. 
+ MI.setDesc(TII->get(NoRetAtomicOp)); + MI.RemoveOperand(0); + + // If we only remove the def operand from the atomic instruction, the + // extract_subreg will be left with a use of a vreg without a def. + // So we need to insert an implicit_def to avoid machine verifier + // errors. + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::IMPLICIT_DEF), Def); + } return; } } -static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) { +static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, + uint64_t Val) { SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32); return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0); } MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, - SDLoc DL, + const SDLoc &DL, SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); // Build the half of the subregister with the constants before building the // full 128-bit register. If we are building multiple resource descriptors, @@ -2524,10 +3327,8 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] /// of the resource descriptor) to create an offset, which is added to /// the resource pointer. -MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, +MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL, + SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const { SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr); SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index f01b2c0d09f3..8e055eea58c2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -12,26 +12,26 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_SIISELLOWERING_H -#define LLVM_LIB_TARGET_R600_SIISELLOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H #include "AMDGPUISelLowering.h" #include "SIInstrInfo.h" namespace llvm { -class SITargetLowering : public AMDGPUTargetLowering { - SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL, +class SITargetLowering final : public AMDGPUTargetLowering { + SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, + unsigned Offset) const; + SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, unsigned Offset, bool Signed) const; - SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op, - SelectionDAG &DAG) const; SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; - SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT, unsigned Offset) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -43,8 +43,13 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const; 
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const; + SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const; + void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const; SDValue performUCharToFloatCombine(SDNode *N, @@ -55,14 +60,25 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const; + + SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; - SDValue performMin3Max3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; + + bool isCFIntrinsic(const SDNode *Intr) const; + + void createDebuggerPrologueStackObjects(MachineFunction &MF) const; public: - SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); + SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); + + const SISubtarget *getSubtarget() const; + + bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, + unsigned IntrinsicID) const override; bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/, EVT /*VT*/) const override; @@ -89,21 +105,30 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; + bool isTypeDesirableForOp(unsigned Op, EVT VT) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc DL, SelectionDAG &DAG, + const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; - SDValue LowerReturn(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const override; + const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, + SelectionDAG &DAG) const override; + + unsigned getRegisterByName(const char* RegName, EVT VT, + SelectionDAG &DAG) const override; + + MachineBasicBlock *splitKillBlock(MachineInstr &MI, + MachineBasicBlock *BB) const; - MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, - MachineBasicBlock * BB) const override; + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr &MI, + MachineBasicBlock *BB) const override; bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -112,7 +137,7 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AdjustInstrPostInstrSelection(MachineInstr *MI, + void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode 
*Node) const override; int32_t analyzeImmediate(const SDNode *N) const; @@ -120,17 +145,16 @@ public: unsigned Reg, EVT VT) const override; void legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const; - MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const; - MachineSDNode *buildRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr, - uint32_t RsrcDword1, - uint64_t RsrcDword2And3) const; + MachineSDNode *wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, + SDValue Ptr) const; + MachineSDNode *buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, + uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; ConstraintType getConstraintType(StringRef Constraint) const override; - SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; + SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, + SDValue V) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp index 94e614750d2f..d24588d6c143 100644 --- a/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -26,6 +26,8 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#define DEBUG_TYPE "si-insert-waits" + using namespace llvm; namespace { @@ -53,7 +55,7 @@ typedef std::pair<unsigned, unsigned> RegInterval; class SIInsertWaits : public MachineFunctionPass { private: - static char ID; + const SISubtarget *ST; const SIInstrInfo *TII; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; @@ -67,6 +69,10 @@ private: /// \brief Counter values we have already waited on. Counters WaitedOn; + /// \brief Counter values that we must wait on before the next counter + /// increase. + Counters DelayedWaitOn; + /// \brief Counter values for last instruction issued. Counters LastIssued; @@ -87,6 +93,9 @@ private: /// \brief Whether the machine function returns void bool ReturnsVoid; + /// Whether the VCCZ bit is possibly corrupt + bool VCCZCorrupt; + /// \brief Get increment/decrement amount for this instruction. Counters getHwCounts(MachineInstr &MI); @@ -99,13 +108,17 @@ private: /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I); + MachineBasicBlock::iterator I, + const Counters& Increment); /// \brief Insert the actual wait instruction bool insertWait(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const Counters &Counts); + /// \brief Handle existing wait instructions (from intrinsics) + void handleExistingWait(MachineBasicBlock::iterator I); + /// \brief Do we need def2def checks? bool unorderedDefines(MachineInstr &MI); @@ -115,12 +128,20 @@ private: /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG. void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + /// Return true if there are LGKM instrucitons that haven't been waited on + /// yet. 
+ bool hasOutstandingLGKM() const; + public: - SIInsertWaits(TargetMachine &tm) : + static char ID; + + SIInsertWaits() : MachineFunctionPass(ID), + ST(nullptr), TII(nullptr), TRI(nullptr), - ExpInstrTypesSeen(0) { } + ExpInstrTypesSeen(0), + VCCZCorrupt(false) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -136,13 +157,28 @@ public: } // End anonymous namespace +INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, + "SI Insert Waits", false, false) +INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE, + "SI Insert Waits", false, false) + char SIInsertWaits::ID = 0; -const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } }; +char &llvm::SIInsertWaitsID = SIInsertWaits::ID; + +FunctionPass *llvm::createSIInsertWaitsPass() { + return new SIInsertWaits(); +} + +const Counters SIInsertWaits::WaitCounts = { { 15, 7, 15 } }; const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } }; -FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { - return new SIInsertWaits(tm); +static bool readsVCCZ(unsigned Opcode) { + return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ; +} + +bool SIInsertWaits::hasOutstandingLGKM() const { + return WaitedOn.Named.LGKM != LastIssued.Named.LGKM; } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { @@ -205,24 +241,23 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; // Check if this operand is the value being stored. - // Special case for DS instructions, since the address + // Special case for DS/FLAT instructions, since the address // operand comes before the value operand and it may have // multiple data operands. - if (TII->isDS(MI)) { + if (TII->isDS(MI) || TII->isFLAT(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; + } + if (TII->isDS(MI)) { MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0); if (Data0 && Op.isIdenticalTo(*Data0)) return true; MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1); - if (Data1 && Op.isIdenticalTo(*Data1)) - return true; - - return false; + return Data1 && Op.isIdenticalTo(*Data1); } // NOTE: This assumes that the value operand is before the @@ -250,10 +285,10 @@ RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, } void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { + MachineBasicBlock::iterator I, + const Counters &Increment) { // Get the hardware counter increments and sum them up - Counters Increment = getHwCounts(*I); Counters Limit = ZeroCounts; unsigned Sum = 0; @@ -270,8 +305,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, return; } - if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // @@ -281,8 +315,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || - (LastOpcodeType == VMEM && Increment.Named.VM)) { + if (LastOpcodeType == VMEM && Increment.Named.VM) { // Insert a NOP to break the clause. 
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) .addImm(0); @@ -379,7 +412,7 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB, BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) .addImm((Counts.Named.VM & 0xF) | ((Counts.Named.EXP & 0x7) << 4) | - ((Counts.Named.LGKM & 0x7) << 8)); + ((Counts.Named.LGKM & 0xF) << 8)); LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -393,16 +426,38 @@ static void increaseCounters(Counters &Dst, const Counters &Src) { Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]); } +/// \brief check whether any of the counters is non-zero +static bool countersNonZero(const Counters &Counter) { + for (unsigned i = 0; i < 3; ++i) + if (Counter.Array[i]) + return true; + return false; +} + +void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) { + assert(I->getOpcode() == AMDGPU::S_WAITCNT); + + unsigned Imm = I->getOperand(0).getImm(); + Counters Counts, WaitOn; + + Counts.Named.VM = Imm & 0xF; + Counts.Named.EXP = (Imm >> 4) & 0x7; + Counts.Named.LGKM = (Imm >> 8) & 0xF; + + for (unsigned i = 0; i < 3; ++i) { + if (Counts.Array[i] <= LastIssued.Array[i]) + WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i]; + else + WaitOn.Array[i] = 0; + } + + increaseCounters(DelayedWaitOn, WaitOn); +} + Counters SIInsertWaits::handleOperands(MachineInstr &MI) { Counters Result = ZeroCounts; - // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, - // but we also want to wait for any other outstanding transfers before - // signalling other hardware blocks - if (MI.getOpcode() == AMDGPU::S_SENDMSG) - return LastIssued; - // For each register affected by this instruction increase the result // sequence. // @@ -432,8 +487,7 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() < - AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. @@ -460,13 +514,13 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; - TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); - + ST = &MF.getSubtarget<SISubtarget>(); + TII = ST->getInstrInfo(); + TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); WaitedOn = ZeroCounts; + DelayedWaitOn = ZeroCounts; LastIssued = ZeroCounts; LastOpcodeType = OTHER; LastInstWritesM0 = false; @@ -475,6 +529,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { memset(&UsedRegs, 0, sizeof(UsedRegs)); memset(&DefinedRegs, 0, sizeof(DefinedRegs)); + SmallVector<MachineInstr *, 4> RemoveMI; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { @@ -482,27 +538,81 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { + if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + // There is a hardware bug on CI/SI where SMRD instruction may corrupt + // vccz bit, so when we detect that an instruction may read from a + // corrupt vccz bit, we need to: + // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to + // complete. + // 2. 
Restore the correct value of vccz by writing the current value + // of vcc back to vcc. + + if (TII->isSMRD(I->getOpcode())) { + VCCZCorrupt = true; + } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) { + // FIXME: We only care about SMRD instructions here, not LDS or GDS. + // Whenever we store a value in vcc, the correct value of vccz is + // restored. + VCCZCorrupt = false; + } + + // Check if we need to apply the bug work-around + if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) { + DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n'); + + // Wait on everything, not just LGKM. vccz reads usually come from + // terminators, and we always wait on everything at the end of the + // block, so if we only wait on LGKM here, we might end up with + // another s_waitcnt inserted right after this if there are non-LGKM + // instructions still outstanding. + insertWait(MBB, I, LastIssued); + + // Restore the vccz bit. Any time a value is written to vcc, the vcc + // bit is updated, so we can restore the bit by reading the value of + // vcc and then writing it back to the register. + BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::VCC) + .addReg(AMDGPU::VCC); + } + } + + // Record pre-existing, explicitly requested waits + if (I->getOpcode() == AMDGPU::S_WAITCNT) { + handleExistingWait(*I); + RemoveMI.push_back(&*I); + continue; + } + + Counters Required; + // Wait for everything before a barrier. - if (I->getOpcode() == AMDGPU::S_BARRIER) - Changes |= insertWait(MBB, I, LastIssued); + // + // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish, + // but we also want to wait for any other outstanding transfers before + // signalling other hardware blocks + if (I->getOpcode() == AMDGPU::S_BARRIER || + I->getOpcode() == AMDGPU::S_SENDMSG) + Required = LastIssued; else - Changes |= insertWait(MBB, I, handleOperands(*I)); + Required = handleOperands(*I); + + Counters Increment = getHwCounts(*I); - pushInstruction(MBB, I); + if (countersNonZero(Required) || countersNonZero(Increment)) + increaseCounters(Required, DelayedWaitOn); + + Changes |= insertWait(MBB, I, Required); + + pushInstruction(MBB, I, Increment); handleSendMsg(MBB, I); } // Wait for everything at the end of the MBB Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued); - - // Functions returning something shouldn't contain S_ENDPGM, because other - // bytecode will be appended after it. 
- if (!ReturnsVoid) { - MachineBasicBlock::iterator I = MBB.getFirstTerminator(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM) - I->eraseFromParent(); - } } + for (MachineInstr *I : RemoveMI) + I->eraseFromParent(); + return Changes; } diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td index 0e883f64caa3..2f63d4ed13b3 100644 --- a/lib/Target/AMDGPU/SIInstrFormats.td +++ b/lib/Target/AMDGPU/SIInstrFormats.td @@ -11,8 +11,9 @@ // //===----------------------------------------------------------------------===// -class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : - AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { +class InstSI <dag outs, dag ins, string asm = "", + list<dag> pattern = []> : + AMDGPUInst<outs, ins, asm, pattern>, PredicateControl { field bits<1> VM_CNT = 0; field bits<1> EXP_CNT = 0; @@ -31,6 +32,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> VOP2 = 0; field bits<1> VOP3 = 0; field bits<1> VOPC = 0; + field bits<1> SDWA = 0; + field bits<1> DPP = 0; field bits<1> MUBUF = 0; field bits<1> MTBUF = 0; @@ -45,6 +48,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : // is unable to infer the encoding from the operands. field bits<1> VOPAsmPrefer32Bit = 0; + field bits<1> Gather4 = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -63,18 +68,33 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{11} = VOP2; let TSFlags{12} = VOP3; let TSFlags{13} = VOPC; - - let TSFlags{14} = MUBUF; - let TSFlags{15} = MTBUF; - let TSFlags{16} = SMRD; - let TSFlags{17} = DS; - let TSFlags{18} = MIMG; - let TSFlags{19} = FLAT; - let TSFlags{20} = WQM; - let TSFlags{21} = VGPRSpill; - let TSFlags{22} = VOPAsmPrefer32Bit; + let TSFlags{14} = SDWA; + let TSFlags{15} = DPP; + + let TSFlags{16} = MUBUF; + let TSFlags{17} = MTBUF; + let TSFlags{18} = SMRD; + let TSFlags{19} = DS; + let TSFlags{20} = MIMG; + let TSFlags{21} = FLAT; + let TSFlags{22} = WQM; + let TSFlags{23} = VGPRSpill; + let TSFlags{24} = VOPAsmPrefer32Bit; + let TSFlags{25} = Gather4; let SchedRW = [Write32Bit]; + + field bits<1> DisableSIDecoder = 0; + field bits<1> DisableVIDecoder = 0; + field bits<1> DisableDecoder = 0; + + let isAsmParserOnly = !if(!eq(DisableDecoder{0}, {0}), 0, 1); +} + +class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []> + : InstSI<outs, ins, "", pattern> { + let isPseudo = 1; + let isCodeGenOnly = 1; } class Enc32 { @@ -123,8 +143,10 @@ class VOP2Common <dag outs, dag ins, string asm, list<dag> pattern> : let Size = 4; } -class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <outs, ins, asm, pattern> { +class VOP3Common <dag outs, dag ins, string asm = "", + list<dag> pattern = [], bit HasMods = 0, + bit VOP3Only = 0> : + VOPAnyCommon <outs, ins, asm, pattern> { // Using complex patterns gives VOP3 patterns a very high complexity rating, // but standalone patterns are almost always prefered, so we need to adjust the @@ -135,7 +157,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let VOP3 = 1; let VALU = 1; - let AsmMatchConverter = "cvtVOP3"; + let AsmMatchConverter = + !if(!eq(VOP3Only,1), + "cvtVOP3", + !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); + let isCodeGenOnly = 0; int Size = 8; @@ -154,9 +180,9 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : class SOP1e <bits<8> op> : Enc32 { bits<7> 
sdst; - bits<8> ssrc0; + bits<8> src0; - let Inst{7-0} = ssrc0; + let Inst{7-0} = src0; let Inst{15-8} = op; let Inst{22-16} = sdst; let Inst{31-23} = 0x17d; //encoding; @@ -164,22 +190,22 @@ class SOP1e <bits<8> op> : Enc32 { class SOP2e <bits<7> op> : Enc32 { bits<7> sdst; - bits<8> ssrc0; - bits<8> ssrc1; + bits<8> src0; + bits<8> src1; - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; + let Inst{7-0} = src0; + let Inst{15-8} = src1; let Inst{22-16} = sdst; let Inst{29-23} = op; let Inst{31-30} = 0x2; // encoding } class SOPCe <bits<7> op> : Enc32 { - bits<8> ssrc0; - bits<8> ssrc1; + bits<8> src0; + bits<8> src1; - let Inst{7-0} = ssrc0; - let Inst{15-8} = ssrc1; + let Inst{7-0} = src0; + let Inst{15-8} = src1; let Inst{22-16} = op; let Inst{31-23} = 0x17e; } @@ -218,9 +244,7 @@ class SOPPe <bits<7> op> : Enc32 { class SMRDe <bits<5> op, bits<1> imm> : Enc32 { bits<7> sdst; bits<7> sbase; - bits<8> offset; - let Inst{7-0} = offset; let Inst{8} = imm; let Inst{14-9} = sbase{6-1}; let Inst{21-15} = sdst; @@ -228,6 +252,18 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe <bits<5> op> : SMRDe<op, 1> { + bits<8> offset; + let Inst{7-0} = offset; +} + +class SMRD_SOFFe <bits<5> op> : SMRDe<op, 0> { + bits<8> soff; + let Inst{7-0} = soff; +} + + + class SMRD_IMMe_ci <bits<5> op> : Enc64 { bits<7> sdst; bits<7> sbase; @@ -348,19 +384,18 @@ class VOP2_MADKe <bits<6> op> : Enc64 { bits<8> vdst; bits<9> src0; - bits<8> vsrc1; - bits<32> src2; + bits<8> src1; + bits<32> imm; let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; + let Inst{16-9} = src1; let Inst{24-17} = vdst; let Inst{30-25} = op; let Inst{31} = 0x0; // encoding - let Inst{63-32} = src2; + let Inst{63-32} = imm; } -class VOP3e <bits<9> op> : Enc64 { - bits<8> vdst; +class VOP3a <bits<9> op> : Enc64 { bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -370,7 +405,6 @@ class VOP3e <bits<9> op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -386,6 +420,20 @@ class VOP3e <bits<9> op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP3e <bits<9> op> : VOP3a <op> { + bits<8> vdst; + + let Inst{7-0} = vdst; +} + +// Encoding used for VOPC instructions encoded as VOP3 +// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst +class VOP3ce <bits<9> op> : VOP3a <op> { + bits<8> sdst; + + let Inst{7-0} = sdst; +} + class VOP3be <bits<9> op> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -412,10 +460,10 @@ class VOP3be <bits<9> op> : Enc64 { class VOPCe <bits<8> op> : Enc32 { bits<9> src0; - bits<8> vsrc1; + bits<8> src1; let Inst{8-0} = src0; - let Inst{16-9} = vsrc1; + let Inst{16-9} = src1; let Inst{24-17} = op; let Inst{31-25} = 0x3e; } @@ -675,17 +723,17 @@ class FLAT <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let UseNamedOperandTable = 1; let hasSideEffects = 0; - let AsmMatchConverter = "cvtFlat"; let SchedRW = [WriteVMEM]; } -class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : - InstSI <outs, ins, asm, pattern>, MIMGe <op> { +class MIMG <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; let Uses = [EXEC]; + let UseNamedOperandTable = 1; let hasSideEffects = 0; // XXX ???? 
} diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 1e10d25e8fb7..d171e21c8a4f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -12,14 +12,15 @@ // //===----------------------------------------------------------------------===// - #include "SIInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNHazardRecognizer.h" #include "SIDefines.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/IR/Function.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" @@ -27,8 +28,8 @@ using namespace llvm; -SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st) - : AMDGPUInstrInfo(st), RI() {} +SIInstrInfo::SIInstrInfo(const SISubtarget &ST) + : AMDGPUInstrInfo(ST), RI(), ST(ST) {} //===----------------------------------------------------------------------===// // TargetInstrInfo callbacks @@ -74,12 +75,12 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } -bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, +bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const { // TODO: The generic check fails for VALU instructions that should be // rematerializable due to implicit reads of exec. We really want all of the // generic logic for this except for this. - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: @@ -201,18 +202,18 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, +bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const { - unsigned Opc = LdSt->getOpcode(); + unsigned Opc = LdSt.getOpcode(); - if (isDS(*LdSt)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + if (isDS(LdSt)) { + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); if (OffsetImm) { // Normal, single offset LDS instruction. - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); @@ -222,10 +223,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, // The 2 offset instructions use offset0 and offset1 instead. We can treat // these as a load with a single offset if the 2 offsets are consecutive. We // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset1); + const MachineOperand *Offset0Imm = + getNamedOperand(LdSt, AMDGPU::OpName::offset0); + const MachineOperand *Offset1Imm = + getNamedOperand(LdSt, AMDGPU::OpName::offset1); uint8_t Offset0 = Offset0Imm->getImm(); uint8_t Offset1 = Offset1Imm->getImm(); @@ -235,19 +236,19 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, // to bytes of the individual reads. 
unsigned EltSize; - if (LdSt->mayLoad()) - EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2; + if (LdSt.mayLoad()) + EltSize = getOpRegClass(LdSt, 0)->getSize() / 2; else { - assert(LdSt->mayStore()); + assert(LdSt.mayStore()); int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); - EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize(); + EltSize = getOpRegClass(LdSt, Data0Idx)->getSize(); } if (isStride64(Opc)) EltSize *= 64; - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::addr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::addr); BaseReg = AddrReg->getReg(); Offset = EltSize * Offset0; return true; @@ -256,63 +257,91 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { + if (isMUBUF(LdSt) || isMTBUF(LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; - const MachineOperand *AddrReg = getNamedOperand(*LdSt, - AMDGPU::OpName::vaddr); + const MachineOperand *AddrReg = + getNamedOperand(LdSt, AMDGPU::OpName::vaddr); if (!AddrReg) return false; - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); BaseReg = AddrReg->getReg(); Offset = OffsetImm->getImm(); return true; } - if (isSMRD(*LdSt)) { - const MachineOperand *OffsetImm = getNamedOperand(*LdSt, - AMDGPU::OpName::offset); + if (isSMRD(LdSt)) { + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); if (!OffsetImm) return false; - const MachineOperand *SBaseReg = getNamedOperand(*LdSt, - AMDGPU::OpName::sbase); + const MachineOperand *SBaseReg = + getNamedOperand(LdSt, AMDGPU::OpName::sbase); BaseReg = SBaseReg->getReg(); Offset = OffsetImm->getImm(); return true; } + if (isFLAT(LdSt)) { + const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr); + BaseReg = AddrReg->getReg(); + Offset = 0; + return true; + } + return false; } -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // TODO: This needs finer tuning - if (NumLoads > 4) +bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { + const MachineOperand *FirstDst = nullptr; + const MachineOperand *SecondDst = nullptr; + + if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); + } + + if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } + + if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + } + + if (!FirstDst || !SecondDst) return false; - if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) - return true; + // Try to limit clustering based on the total number of bytes loaded + // rather than the number of instructions. This is done to help reduce + // register pressure. The method used is somewhat inexact, though, + // because it assumes that all loads in the cluster will load the + // same number of bytes as FirstLdSt. 
- if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) - return true; + // The unit of this value is bytes. + // FIXME: This needs finer tuning. + unsigned LoadClusterThreshold = 16; - if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && - (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) - return true; + const MachineRegisterInfo &MRI = + FirstLdSt.getParent()->getParent()->getRegInfo(); + const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg()); - return false; + return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold; } -void -SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // If we are trying to copy to or from SCC, there is a bug somewhere else in // the backend. While it may be theoretically possible to do this, it should @@ -361,7 +390,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Opcode; ArrayRef<int16_t> SubIndices; - bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -445,10 +473,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) - Forward = true; - else - Forward = false; + bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { unsigned SubIdx; @@ -463,10 +488,12 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); if (Idx == SubIndices.size() - 1) - Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); + Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); + + Builder.addReg(SrcReg, RegState::Implicit); } } @@ -525,6 +552,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V32_SAVE; case 8: return AMDGPU::SI_SPILL_V64_SAVE; + case 12: + return AMDGPU::SI_SPILL_V96_SAVE; case 16: return AMDGPU::SI_SPILL_V128_SAVE; case 32: @@ -558,19 +587,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); + if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) { + // m0 may not be allowed for readlane. + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); + } + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. 
unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) // src + .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); @@ -585,10 +620,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) // src + .addReg(SrcReg, getKillRegState(isKill)) // src .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } @@ -615,6 +651,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V32_RESTORE; case 8: return AMDGPU::SI_SPILL_V64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_V96_RESTORE; case 16: return AMDGPU::SI_SPILL_V128_RESTORE; case 32: @@ -648,6 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); + + if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) { + // m0 may not be allowed for readlane. + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); + } + BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // frame_idx .addMemOperand(MMO); @@ -655,7 +700,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); @@ -671,20 +716,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, .addFrameIndex(FrameIndex) // frame_idx .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(0) // offset .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled -unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, unsigned TmpReg, - unsigned FrameOffset, - unsigned Size) const { +unsigned SIInstrInfo::calculateLDSSpillAddress( + MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, unsigned Size) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -699,8 +742,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (TIDReg == AMDGPU::NoRegister) return TIDReg; - - if (MFI->getShaderType() == 
ShaderType::COMPUTE && + if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && WorkGroupSize > WavefrontSize) { unsigned TIDIGXReg @@ -716,7 +758,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, Entry.addLiveIn(Reg); } - RS->enterBasicBlock(&Entry); + RS->enterBasicBlock(Entry); // FIXME: Can we scavenge an SReg_64 and access the subregs? unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); @@ -773,8 +815,10 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, +void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, int Count) const { + DebugLoc DL = MBB.findDebugLoc(MI); while (Count > 0) { int Arg; if (Count >= 8) @@ -782,76 +826,87 @@ void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, else Arg = Count - 1; Count -= 8; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP)) + BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)) .addImm(Arg); } } -bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - MachineBasicBlock &MBB = *MI->getParent(); +void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + insertWaitStates(MBB, MI, 1); +} + +unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: return 1; // FIXME: Do wait states equal cycles? + + case AMDGPU::S_NOP: + return MI.getOperand(0).getImm() + 1; + } +} + +bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MBB.findDebugLoc(MI); - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SGPR_USE: - // This is just a placeholder for register allocation. - MI->eraseFromParent(); - break; - case AMDGPU::V_MOV_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - const MachineOperand &SrcOp = MI->getOperand(1); + const MachineOperand &SrcOp = MI.getOperand(1); // FIXME: Will this work for 64-bit floating point immediates? 
assert(!SrcOp.isFPImm()); if (SrcOp.isImm()) { APInt Imm(64, SrcOp.getImm()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Imm.getLoBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); + .addImm(Imm.getLoBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Imm.getHiBits(32).getZExtValue()) - .addReg(Dst, RegState::Implicit); + .addImm(Imm.getHiBits(32).getZExtValue()) + .addReg(Dst, RegState::Implicit | RegState::Define); } else { assert(SrcOp.isReg()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) + .addReg(Dst, RegState::Implicit | RegState::Define); } - MI->eraseFromParent(); + MI.eraseFromParent(); break; } case AMDGPU::V_CNDMASK_B64_PSEUDO: { - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0); unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1); - unsigned Src0 = MI->getOperand(1).getReg(); - unsigned Src1 = MI->getOperand(2).getReg(); - const MachineOperand &SrcCond = MI->getOperand(3); + unsigned Src0 = MI.getOperand(1).getReg(); + unsigned Src1 = MI.getOperand(2).getReg(); + const MachineOperand &SrcCond = MI.getOperand(3); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) - .addOperand(SrcCond); + .addReg(RI.getSubReg(Src0, AMDGPU::sub0)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub0)) + .addReg(SrcCond.getReg()) + .addReg(Dst, RegState::Implicit | RegState::Define); BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) - .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) - .addOperand(SrcCond); - MI->eraseFromParent(); + .addReg(RI.getSubReg(Src0, AMDGPU::sub1)) + .addReg(RI.getSubReg(Src1, AMDGPU::sub1)) + .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill())) + .addReg(Dst, RegState::Implicit | RegState::Define); + MI.eraseFromParent(); break; } - case AMDGPU::SI_CONSTDATA_PTR: { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + case AMDGPU::SI_PC_ADD_REL_OFFSET: { + const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); MachineFunction &MF = *MBB.getParent(); - unsigned Reg = MI->getOperand(0).getReg(); + unsigned Reg = MI.getOperand(0).getReg(); unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); @@ -863,15 +918,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { // Add 32-bit offset from this instruction to the start of the // constant data. 
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addOperand(MI->getOperand(1))); + .addReg(RegLo) + .addOperand(MI.getOperand(1))); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) .addReg(RegHi) .addImm(0)); llvm::finalizeBundle(MBB, Bundler.begin()); - MI->eraseFromParent(); + MI.eraseFromParent(); break; } } @@ -885,22 +940,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { /// non-commutable pair of operand indices OpIdx0 and OpIdx1. /// Even though the instruction is commutable, the method may still /// fail to commute the operands, null pointer is returned in such cases. -MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, - bool NewMI, +MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const { - int CommutedOpcode = commuteOpcode(*MI); + int CommutedOpcode = commuteOpcode(MI); if (CommutedOpcode == -1) return nullptr; - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + int Src0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); + int Src1Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || OpIdx1 != static_cast<unsigned>(Src1Idx)) && @@ -908,33 +962,32 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); - - if (isVOP2(*MI)) { - const MCInstrDesc &InstrDesc = MI->getDesc(); - // For VOP2 instructions, any operand type is valid to use for src0. Make - // sure we can use the src1 as src0. + if (isVOP2(MI) || isVOPC(MI)) { + const MCInstrDesc &InstrDesc = MI.getDesc(); + // For VOP2 and VOPC instructions, any operand type is valid to use for + // src0. Make sure we can use the src0 as src1. // // We could be stricter here and only allow commuting if there is a reason // to do so. i.e. if both operands are VGPRs there is no real benefit, // although MachineCSE attempts to find matches by commuting. - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) return nullptr; } + MachineInstr *CommutedMI = &MI; if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. - if (NewMI || !Src1.isImm() || - (!isVOP2(*MI) && !isVOP3(*MI))) { + if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) { return nullptr; } // Be sure to copy the source modifiers to the right place. 
- if (MachineOperand *Src0Mods - = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { - MachineOperand *Src1Mods - = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers); + if (MachineOperand *Src0Mods = + getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) { + MachineOperand *Src1Mods = + getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); int Src0ModsVal = Src0Mods->getImm(); if (!Src1Mods && Src0ModsVal != 0) @@ -959,26 +1012,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); + CommutedMI = + TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } - if (MI) - MI->setDesc(get(CommutedOpcode)); + if (CommutedMI) + CommutedMI->setDesc(get(CommutedOpcode)); - return MI; + return CommutedMI; } // This needs to be implemented because the source modifiers may be inserted // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. -bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx0, +bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const { - const MCInstrDesc &MCID = MI->getDesc(); + const MCInstrDesc &MCID = MI.getDesc(); if (!MCID.isCommutable()) return false; - unsigned Opc = MI->getOpcode(); + unsigned Opc = MI.getOpcode(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); if (Src0Idx == -1) return false; @@ -986,24 +1039,24 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on // immediate. Also, immediate src0 operand is not handled in // SIInstrInfo::commuteInstruction(); - if (!MI->getOperand(Src0Idx).isReg()) + if (!MI.getOperand(Src0Idx).isReg()) return false; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return false; - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); if (Src1.isImm()) { // SIInstrInfo::commuteInstruction() does support commuting the immediate // operand src1 in 2 and 3 operand instructions. - if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode())) return false; } else if (Src1.isReg()) { // If any source modifiers are set, the generic instruction commuting won't // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)) return false; } else return false; @@ -1011,23 +1064,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } -MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, - unsigned SrcReg) const { - return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32), - DstReg) .addReg(SrcReg); +unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { + switch (Cond) { + case SIInstrInfo::SCC_TRUE: + return AMDGPU::S_CBRANCH_SCC1; + case SIInstrInfo::SCC_FALSE: + return AMDGPU::S_CBRANCH_SCC0; + case SIInstrInfo::VCCNZ: + return AMDGPU::S_CBRANCH_VCCNZ; + case SIInstrInfo::VCCZ: + return AMDGPU::S_CBRANCH_VCCZ; + case SIInstrInfo::EXECNZ: + return AMDGPU::S_CBRANCH_EXECNZ; + case SIInstrInfo::EXECZ: + return AMDGPU::S_CBRANCH_EXECZ; + default: + llvm_unreachable("invalid branch predicate"); + } +} + +SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::S_CBRANCH_SCC0: + return SCC_FALSE; + case AMDGPU::S_CBRANCH_SCC1: + return SCC_TRUE; + case AMDGPU::S_CBRANCH_VCCNZ: + return VCCNZ; + case AMDGPU::S_CBRANCH_VCCZ: + return VCCZ; + case AMDGPU::S_CBRANCH_EXECNZ: + return EXECNZ; + case AMDGPU::S_CBRANCH_EXECZ: + return EXECZ; + default: + return INVALID_BR; + } } -bool SIInstrInfo::isMov(unsigned Opcode) const { - switch(Opcode) { - default: return false; - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - case AMDGPU::V_MOV_B32_e64: +bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + + if (I == MBB.end()) + return false; + + if (I->getOpcode() == AMDGPU::S_BRANCH) { + // Unconditional Branch + TBB = I->getOperand(0).getMBB(); + return false; + } + + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) return true; + + MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + + ++I; + + if (I == MBB.end()) { + // Conditional branch followed by fall-through. 
+ TBB = CondBB; + return false; + } + + if (I->getOpcode() == AMDGPU::S_BRANCH) { + TBB = CondBB; + FBB = I->getOperand(0).getMBB(); + return false; + } + + return true; +} + +unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.getFirstTerminator(); + + unsigned Count = 0; + while (I != MBB.end()) { + MachineBasicBlock::iterator Next = std::next(I); + I->eraseFromParent(); + ++Count; + I = Next; + } + + return Count; +} + +unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { + + if (!FBB && Cond.empty()) { + BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) + .addMBB(TBB); + return 1; + } + + assert(TBB && Cond[0].isImm()); + + unsigned Opcode + = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); + + if (!FBB) { + BuildMI(&MBB, DL, get(Opcode)) + .addMBB(TBB); + return 1; } + + assert(TBB && FBB); + + BuildMI(&MBB, DL, get(Opcode)) + .addMBB(TBB); + BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) + .addMBB(FBB); + + return 2; +} + +bool SIInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 1); + Cond[0].setImm(-Cond[0].getImm()); + return false; } static void removeModOperands(MachineInstr &MI) { @@ -1044,81 +1209,76 @@ static void removeModOperands(MachineInstr &MI) { MI.RemoveOperand(Src0ModIdx); } -bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, +// TODO: Maybe this should be removed this and custom fold everything in +// SIFoldOperands? +bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; - unsigned Opc = UseMI->getOpcode(); + unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. - if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) || - hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) { + if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) || + hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) { return false; } - MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0); - MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1); - MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2); + const MachineOperand &ImmOp = DefMI.getOperand(1); + + // If this is a free constant, there's no reason to do this. + // TODO: We could fold this here instead of letting SIFoldOperands do it + // later. + if (isInlineConstant(ImmOp, 4)) + return false; + + MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); + MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_f32 // We should only expect these to be on src0 due to canonicalizations. 
if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; - if (!Src2->isReg() || - (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))) + if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; - // We need to do some weird looking operand shuffling since the madmk - // operands are out of the normal expected order with the multiplied - // constant as the last operand. - // - // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1 - // src0 -> src2 K - // src1 -> src0 - // src2 -> src1 + // We need to swap operands 0 and 1 since madmk constant is at operand 1. - const int64_t Imm = DefMI->getOperand(1).getImm(); + const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::clamp)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); unsigned Src1Reg = Src1->getReg(); unsigned Src1SubReg = Src1->getSubReg(); - unsigned Src2Reg = Src2->getReg(); - unsigned Src2SubReg = Src2->getSubReg(); Src0->setReg(Src1Reg); Src0->setSubReg(Src1SubReg); Src0->setIsKill(Src1->isKill()); - Src1->setReg(Src2Reg); - Src1->setSubReg(Src2SubReg); - Src1->setIsKill(Src2->isKill()); - if (Opc == AMDGPU::V_MAC_F32_e64) { - UseMI->untieRegOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + UseMI.untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - Src2->ChangeToImmediate(Imm); + Src1->ChangeToImmediate(Imm); - removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADMK_F32)); + removeModOperands(UseMI); + UseMI.setDesc(get(AMDGPU::V_MADMK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) - DefMI->eraseFromParent(); + DefMI.eraseFromParent(); return true; } @@ -1131,36 +1291,35 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) return false; - if (!Src1->isReg() || - (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) + if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) return false; - const int64_t Imm = DefMI->getOperand(1).getImm(); + const int64_t Imm = DefMI.getOperand(1).getImm(); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::clamp)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); + UseMI.RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); if (Opc == AMDGPU::V_MAC_F32_e64) { - UseMI->untieRegOperand( - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + UseMI.untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. 
- removeModOperands(*UseMI); - UseMI->setDesc(get(AMDGPU::V_MADAK_F32)); + removeModOperands(UseMI); + UseMI.setDesc(get(AMDGPU::V_MADAK_F32)); bool DeleteDef = MRI->hasOneNonDBGUse(Reg); if (DeleteDef) - DefMI->eraseFromParent(); + DefMI.eraseFromParent(); return true; } @@ -1177,17 +1336,20 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, return LowOffset + LowWidth <= HighOffset; } -bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const { - unsigned BaseReg0, Offset0; - unsigned BaseReg1, Offset1; +bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa, + MachineInstr &MIb) const { + unsigned BaseReg0, BaseReg1; + int64_t Offset0, Offset1; if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) && getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) { - assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() && - "read2 / write2 not expected here yet"); - unsigned Width0 = (*MIa->memoperands_begin())->getSize(); - unsigned Width1 = (*MIb->memoperands_begin())->getSize(); + + if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { + // FIXME: Handle ds_read2 / ds_write2. + return false; + } + unsigned Width0 = (*MIa.memoperands_begin())->getSize(); + unsigned Width1 = (*MIb.memoperands_begin())->getSize(); if (BaseReg0 == BaseReg1 && offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { return true; @@ -1197,19 +1359,19 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, return false; } -bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, +bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa, + MachineInstr &MIb, AliasAnalysis *AA) const { - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && + assert((MIa.mayLoad() || MIa.mayStore()) && "MIa must load from or modify a memory location"); - assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && + assert((MIb.mayLoad() || MIb.mayStore()) && "MIb must load from or modify a memory location"); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects()) return false; // XXX - Can we relax this between address spaces? - if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // TODO: Should we check the address space from the MachineMemOperand? That @@ -1217,29 +1379,29 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
- if (isDS(*MIa)) { - if (isDS(*MIb)) + if (isDS(MIa)) { + if (isDS(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb); + return !isFLAT(MIb); } - if (isMUBUF(*MIa) || isMTBUF(*MIa)) { - if (isMUBUF(*MIb) || isMTBUF(*MIb)) + if (isMUBUF(MIa) || isMTBUF(MIa)) { + if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb) && !isSMRD(*MIb); + return !isFLAT(MIb) && !isSMRD(MIb); } - if (isSMRD(*MIa)) { - if (isSMRD(*MIb)) + if (isSMRD(MIa)) { + if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); + return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); } - if (isFLAT(*MIa)) { - if (isFLAT(*MIb)) + if (isFLAT(MIa)) { + if (isFLAT(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1249,35 +1411,49 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, } MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, - MachineBasicBlock::iterator &MI, - LiveVariables *LV) const { - - switch (MI->getOpcode()) { - default: return nullptr; - case AMDGPU::V_MAC_F32_e64: break; - case AMDGPU::V_MAC_F32_e32: { - const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); - if (Src0->isImm() && !isInlineConstant(*Src0, 4)) - return nullptr; - break; - } + MachineInstr &MI, + LiveVariables *LV) const { + + switch (MI.getOpcode()) { + default: + return nullptr; + case AMDGPU::V_MAC_F32_e64: + break; + case AMDGPU::V_MAC_F32_e32: { + const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + return nullptr; + break; + } } - const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst); - const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); - const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); - const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); + const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst); + const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1); + const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); - return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) - .addOperand(*Dst) - .addImm(0) // Src0 mods - .addOperand(*Src0) - .addImm(0) // Src1 mods - .addOperand(*Src1) - .addImm(0) // Src mods - .addOperand(*Src2) - .addImm(0) // clamp - .addImm(0); // omod + return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32)) + .addOperand(*Dst) + .addImm(0) // Src0 mods + .addOperand(*Src0) + .addImm(0) // Src1 mods + .addOperand(*Src1) + .addImm(0) // Src mods + .addOperand(*Src2) + .addImm(0) // clamp + .addImm(0); // omod +} + +bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // XXX - Do we want the SP check in the base implementation? + + // Target-independent instructions do not have an implicit-use of EXEC, even + // when they operate on VGPRs. Treating EXEC modifications as scheduling + // boundaries prevents incorrect movements of such instructions. 
+ return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || + MI.modifiesRegister(AMDGPU::EXEC, &RI); } bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { @@ -1355,9 +1531,9 @@ static bool compareMachineOp(const MachineOperand &Op0, } } -bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, - const MachineOperand &MO) const { - const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo]; +bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, + const MachineOperand &MO) const { + const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo]; assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); @@ -1418,14 +1594,10 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return true; // SGPRs use the constant bus - if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || - (!MO.isImplicit() && - (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || - AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) { - return true; - } - - return false; + return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 || + (!MO.isImplicit() && + (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) || + AMDGPU::SGPR_64RegClass.contains(MO.getReg())))); } static unsigned findImplicitSGPRRead(const MachineInstr &MI) { @@ -1448,10 +1620,33 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) { return AMDGPU::NoRegister; } -bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, +static bool shouldReadExec(const MachineInstr &MI) { + if (SIInstrInfo::isVALU(MI)) { + switch (MI.getOpcode()) { + case AMDGPU::V_READLANE_B32: + case AMDGPU::V_READLANE_B32_si: + case AMDGPU::V_READLANE_B32_vi: + case AMDGPU::V_WRITELANE_B32: + case AMDGPU::V_WRITELANE_B32_si: + case AMDGPU::V_WRITELANE_B32_vi: + return false; + } + + return true; + } + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) || + SIInstrInfo::isSALU(MI) || + SIInstrInfo::isSMRD(MI)) + return false; + + return true; +} + +bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { - uint16_t Opcode = MI->getOpcode(); - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + uint16_t Opcode = MI.getOpcode(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); @@ -1459,14 +1654,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); if (!Desc.isVariadic() && - Desc.getNumOperands() != MI->getNumExplicitOperands()) { - ErrInfo = "Instruction has wrong number of operands."; - return false; + Desc.getNumOperands() != MI.getNumExplicitOperands()) { + ErrInfo = "Instruction has wrong number of operands."; + return false; } // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { - if (MI->getOperand(i).isFPImm()) { + if (MI.getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. 
ISel should bitcast " "all fp values to integers."; return false; @@ -1476,7 +1671,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, switch (Desc.OpInfo[i].OperandType) { case MCOI::OPERAND_REGISTER: - if (MI->getOperand(i).isImm()) { + if (MI.getOperand(i).isImm()) { ErrInfo = "Illegal immediate value for operand."; return false; } @@ -1484,17 +1679,18 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, case AMDGPU::OPERAND_REG_IMM32: break; case AMDGPU::OPERAND_REG_INLINE_C: - if (isLiteralConstant(MI->getOperand(i), + if (isLiteralConstant(MI.getOperand(i), RI.getRegClass(RegClass)->getSize())) { ErrInfo = "Illegal immediate value for operand."; return false; } break; case MCOI::OPERAND_IMMEDIATE: + case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. // FrameIndex operands will be replaced by immediates, so they are // allowed. - if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) { + if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) { ErrInfo = "Expected immediate, but got non-immediate"; return false; } @@ -1503,12 +1699,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, continue; } - if (!MI->getOperand(i).isReg()) + if (!MI.getOperand(i).isReg()) continue; if (RegClass != -1) { - unsigned Reg = MI->getOperand(i).getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) + unsigned Reg = MI.getOperand(i).getReg(); + if (Reg == AMDGPU::NoRegister || + TargetRegisterInfo::isVirtualRegister(Reg)) continue; const TargetRegisterClass *RC = RI.getRegClass(RegClass); @@ -1519,23 +1716,26 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } - // Verify VOP* - if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { + if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = findImplicitSGPRRead(*MI); + + if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) + ++ConstantBusCount; + + unsigned SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) ++ConstantBusCount; for (int OpIdx : OpIndices) { if (OpIdx == -1) break; - const MachineOperand &MO = MI->getOperand(OpIdx); + const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) @@ -1555,9 +1755,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify misc. restrictions on specific instructions. if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 || Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) { - const MachineOperand &Src0 = MI->getOperand(Src0Idx); - const MachineOperand &Src1 = MI->getOperand(Src1Idx); - const MachineOperand &Src2 = MI->getOperand(Src2Idx); + const MachineOperand &Src0 = MI.getOperand(Src0Idx); + const MachineOperand &Src1 = MI.getOperand(Src1Idx); + const MachineOperand &Src2 = MI.getOperand(Src2Idx); if (Src0.isReg() && Src1.isReg() && Src2.isReg()) { if (!compareMachineOp(Src0, Src1) && !compareMachineOp(Src0, Src2)) { @@ -1569,9 +1769,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Make sure we aren't losing exec uses in the td files. This mostly requires // being careful when using let Uses to try to add other use registers. 
- if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { - const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); - if (!Exec || !Exec->isImplicit()) { + if (shouldReadExec(MI)) { + if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { ErrInfo = "VALU instruction does not implicitly read exec mask"; return false; } @@ -1624,22 +1823,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32; case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORD_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM_ci: - return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32; + case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32; + case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32; + case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32; + case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32; + case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; + case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; + case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; } } @@ -1676,12 +1871,12 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const { } } -void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { +void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MachineBasicBlock::iterator I = MI; - MachineBasicBlock *MBB = MI->getParent(); - MachineOperand &MO = MI->getOperand(OpIdx); + MachineBasicBlock *MBB = MI.getParent(); + MachineOperand &MO = MI.getOperand(OpIdx); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass; + unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (MO.isReg()) @@ -1689,7 +1884,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { else if (RI.isSGPRClass(RC)) Opcode = AMDGPU::S_MOV_B32; - const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC); if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC)) VRC = &AMDGPU::VReg_64RegClass; @@ -1698,8 +1892,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const { unsigned Reg = MRI.createVirtualRegister(VRC); DebugLoc DL = MBB->findDebugLoc(I); - BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg) - .addOperand(MO); + BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO); MO.ChangeToRegister(Reg, false); } @@ -1758,11 +1951,11 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( } // Change the order of operands from (0, 1, 2) to (0, 2, 1) -void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { - assert(Inst->getNumExplicitOperands() == 3); - MachineOperand Op1 = 
Inst->getOperand(1); - Inst->RemoveOperand(1); - Inst->addOperand(Op1); +void SIInstrInfo::swapOperands(MachineInstr &Inst) const { + assert(Inst.getNumExplicitOperands() == 3); + MachineOperand Op1 = Inst.getOperand(1); + Inst.RemoveOperand(1); + Inst.addOperand(Op1); } bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, @@ -1804,26 +1997,32 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, return true; } -bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, +bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - const MCInstrDesc &InstDesc = get(MI->getOpcode()); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) - MO = &MI->getOperand(OpIdx); + MO = &MI.getOperand(OpIdx); + + if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + + RegSubRegPair SGPRUsed; + if (MO->isReg()) + SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); - if (isVALU(*MI) && - usesConstantBus(MRI, *MO, DefinedRC->getSize())) { - unsigned SGPRUsed = - MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; - const MachineOperand &Op = MI->getOperand(i); - if (Op.isReg() && Op.getReg() != SGPRUsed && - usesConstantBus(MRI, Op, getOpSize(*MI, i))) { + const MachineOperand &Op = MI.getOperand(i); + if (Op.isReg()) { + if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && + usesConstantBus(MRI, Op, getOpSize(MI, i))) { + return false; + } + } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { return false; } } @@ -1834,7 +2033,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isLegalRegOperand(MRI, OpInfo, *MO); } - // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI()); @@ -1847,12 +2045,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, } void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, - MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); + MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); const MCInstrDesc &InstrDesc = get(Opc); int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); - MachineOperand &Src1 = MI->getOperand(Src1Idx); + MachineOperand &Src1 = MI.getOperand(Src1Idx); // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 // we need to only have one constant bus use. @@ -1860,10 +2058,10 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // Note we do not need to worry about literal constants here. They are // disabled for the operand type for instructions because they will always // violate the one constant bus use rule. 
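// Sketch of the one-constant-bus rule enforced here (illustrative only; the
// helpers named below all appear elsewhere in this file):
//   unsigned BusUses = findImplicitSGPRRead(MI) != AMDGPU::NoRegister ? 1 : 0;
//   for (int Idx : {Src0Idx, Src1Idx}) {
//     const MachineOperand &MO = MI.getOperand(Idx);
//     if (usesConstantBus(MRI, MO, getOpSize(Opc, Idx)) && ++BusUses > 1)
//       legalizeOpWithMove(MI, Idx);   // copy the extra SGPR/imm into a VGPR
//   }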
- bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; if (HasImplicitSGPR) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); @@ -1878,13 +2076,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // commute if it is possible. We only want to commute here if it improves // legality. This can be called a fairly large number of times so don't waste // compile time pointlessly swapping and checking legality again. - if (HasImplicitSGPR || !MI->isCommutable()) { + if (HasImplicitSGPR || !MI.isCommutable()) { legalizeOpWithMove(MI, Src1Idx); return; } int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); - MachineOperand &Src0 = MI->getOperand(Src0Idx); + MachineOperand &Src0 = MI.getOperand(Src0Idx); // If src0 can be used as src1, commuting will make the operands legal. // Otherwise we have to give up and insert a move. @@ -1897,13 +2095,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, return; } - int CommutedOpc = commuteOpcode(*MI); + int CommutedOpc = commuteOpcode(MI); if (CommutedOpc == -1) { legalizeOpWithMove(MI, Src1Idx); return; } - MI->setDesc(get(CommutedOpc)); + MI.setDesc(get(CommutedOpc)); unsigned Src0Reg = Src0.getReg(); unsigned Src0SubReg = Src0.getSubReg(); @@ -1925,10 +2123,9 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, // operand, and since literal constants are not allowed and should never be // seen, we only need to worry about inserting copies if we use multiple SGPR // operands. -void SIInstrInfo::legalizeOperandsVOP3( - MachineRegisterInfo &MRI, - MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, + MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); int VOP3Idx[3] = { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -1943,7 +2140,7 @@ void SIInstrInfo::legalizeOperandsVOP3( int Idx = VOP3Idx[i]; if (Idx == -1) break; - MachineOperand &MO = MI->getOperand(Idx); + MachineOperand &MO = MI.getOperand(Idx); // We should never see a VOP3 instruction with an illegal immediate operand. 
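// Sketch of the VOP3 constraint handled by this loop (illustrative; assume
// SGPRReg is the single SGPR that findUsedSGPR() lets the instruction keep):
// src0, src1 and src2 may together read at most one distinct SGPR, so any
// other SGPR operand is rewritten through a VGPR copy:
//   if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg())) &&
//       MO.getReg() != SGPRReg)
//     legalizeOpWithMove(MI, Idx);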
if (!MO.isReg()) @@ -1964,32 +2161,78 @@ void SIInstrInfo::legalizeOperandsVOP3( } } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, + MachineRegisterInfo &MRI) const { + const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); + const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); + unsigned DstReg = MRI.createVirtualRegister(SRC); + unsigned SubRegs = VRC->getSize() / 4; + + SmallVector<unsigned, 8> SRegs; + for (unsigned i = 0; i < SubRegs; ++i) { + unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(AMDGPU::V_READFIRSTLANE_B32), SGPR) + .addReg(SrcReg, 0, RI.getSubRegFromChannel(i)); + SRegs.push_back(SGPR); + } + + MachineInstrBuilder MIB = + BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(), + get(AMDGPU::REG_SEQUENCE), DstReg); + for (unsigned i = 0; i < SubRegs; ++i) { + MIB.addReg(SRegs[i]); + MIB.addImm(RI.getSubRegFromChannel(i)); + } + return DstReg; +} + +void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI, + MachineInstr &MI) const { + + // If the pointer is store in VGPRs, then we need to move them to + // SGPRs using v_readfirstlane. This is safe because we only select + // loads with uniform pointers to SMRD instruction so we know the + // pointer value is uniform. + MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase); + if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI); + SBase->setReg(SGPR); + } +} + +void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); // Legalize VOP2 - if (isVOP2(*MI)) { + if (isVOP2(MI) || isVOPC(MI)) { legalizeOperandsVOP2(MRI, MI); return; } // Legalize VOP3 - if (isVOP3(*MI)) { + if (isVOP3(MI)) { legalizeOperandsVOP3(MRI, MI); return; } + // Legalize SMRD + if (isSMRD(MI)) { + legalizeOperandsSMRD(MRI, MI); + return; + } + // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::PHI) { + if (MI.getOpcode() == AMDGPU::PHI) { const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) { + if (!MI.getOperand(i).isReg() || + !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg())) continue; const TargetRegisterClass *OpRC = - MRI.getRegClass(MI->getOperand(i).getReg()); + MRI.getRegClass(MI.getOperand(i).getReg()); if (RI.hasVGPRs(OpRC)) { VRC = OpRC; } else { @@ -2000,7 +2243,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // If any of the operands are VGPR registers, then they all most be // otherwise we will create illegal VGPR->SGPR copies when legalizing // them. - if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) { + if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) { if (!VRC) { assert(SRC); VRC = RI.getEquivalentVGPRClass(SRC); @@ -2011,18 +2254,18 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } // Update all the operands so they have the same type. 
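// Worked MIR example for the PHI case (names and classes assumed for
// illustration): given
//   %v:sreg_32 = PHI %a:sgpr_32, %bb.1, %b:vgpr_32, %bb.2
// the loop below moves every incoming value into the common VGPR class by
// inserting a COPY before the terminator of its source block:
//   %av:vgpr_32 = COPY %a            ; in %bb.1
//   %bv:vgpr_32 = COPY %b            ; in %bb.2
//   %v:vgpr_32  = PHI %av, %bb.1, %bv, %bb.2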
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. - MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); } } @@ -2030,15 +2273,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // REG_SEQUENCE doesn't really require operand legalization, but if one has a // VGPR dest type and SGPR sources, insert copies so all operands are // VGPRs. This seems to help operand folding / the register coalescer. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - MachineBasicBlock *MBB = MI->getParent(); - const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); if (RI.hasVGPRs(DstRC)) { // Update all the operands so they are VGPR register classes. These may // not be the same register class because REG_SEQUENCE supports mixing // subregister index types e.g. sub0_sub1 + sub2 + sub3 - for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; @@ -2049,8 +2292,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned DstReg = MRI.createVirtualRegister(VRC); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); Op.setIsKill(); @@ -2062,17 +2305,33 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize INSERT_SUBREG // src0 must have the same register class as dst - if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src0 = MI->getOperand(1).getReg(); + if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI->getOperand(1).setReg(NewSrc0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI.getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MIMG + if (isMIMG(MI)) { + MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); + if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); + SRsrc->setReg(SGPR); + } + + MachineOperand *SSamp = getNamedOperand(MI, 
AMDGPU::OpName::ssamp); + if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); + SSamp->setReg(SGPR); } return; } @@ -2081,11 +2340,11 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. int SRsrcIdx = - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (SRsrcIdx != -1) { // We have an MUBUF instruction - MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); - unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), RI.getRegClass(SRsrcRC))) { // The operands are legal. @@ -2093,7 +2352,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { return; } - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); // Extract the ptr from the resource descriptor. unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, @@ -2107,30 +2366,27 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) + .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer @@ -2139,7 +2395,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtr, 0, AMDGPU::sub0) .addReg(VAddr->getReg(), 0, AMDGPU::sub0); @@ -2150,82 +2406,82 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addReg(VAddr->getReg(), 0, AMDGPU::sub1); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(NewVAddrLo) - 
.addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. - assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() - < AMDGPUSubtarget::VOLCANIC_ISLANDS && + assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() + < SISubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); - MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); - MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); // Atomics rith return have have an additional tied operand and are // missing some of the special bits. - MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; if (!VDataIn) { // Regular buffer load / store. - MachineInstrBuilder MIB - = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); // Atomics do not have this operand. - if (const MachineOperand *GLC - = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + if (const MachineOperand *GLC = + getNamedOperand(MI, AMDGPU::OpName::glc)) { MIB.addImm(GLC->getImm()); } - MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); - if (const MachineOperand *TFE - = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + if (const MachineOperand *TFE = + getNamedOperand(MI, AMDGPU::OpName::tfe)) { MIB.addImm(TFE->getImm()); } - MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); Addr64 = MIB; } else { // Atomics with return. - Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addOperand(*VDataIn) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. 
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) + .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } - MI->removeFromParent(); - MI = Addr64; + MI.removeFromParent(); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(SRsrcPtr, 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(SRsrcPtr, 0, AMDGPU::sub1) - .addImm(AMDGPU::sub1); - - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + + VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); } // Update the instruction to use NewVaddr @@ -2235,300 +2491,85 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addReg(SOff->getReg(), 0, SOff->getSubReg()) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - const TargetRegisterClass *NewDstRC; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - NewDstRC = &AMDGPU::VReg_64RegClass; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - NewDstRC = &AMDGPU::VReg_128RegClass; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - NewDstRC = &AMDGPU::VReg_256RegClass; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - NewDstRC = &AMDGPU::VReg_512RegClass; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - unsigned OldDst = MI->getOperand(0).getReg(); - unsigned NewDst = MRI.createVirtualRegister(NewDstRC); - - MRI.replaceRegWith(OldDst, NewDst); - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, - MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const { - MachineBasicBlock *MBB = MI->getParent(); - int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; - switch(RI.getRegClass(DstRCID)->getSize()) { - case 4: - case 8: - case 16: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offsets on SI and byte offset on VI - // and MUBUF instructions always take a byte offset. 
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - - const MCInstrDesc &NewInstDesc = get(NewOpcode); - const TargetRegisterClass *NewDstRC - = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - unsigned DstReg = MI->getOperand(0).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - - MachineInstr *NewInst = - BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) - .addOperand(MI->getOperand(1)) // sbase - .addReg(SRsrc) - .addImm(0) - .addImm(ImmOffset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); - MI->eraseFromParent(); - - legalizeOperands(NewInst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); - break; - } - case 32: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - - case 64: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - } -} - void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector<MachineInstr *, 128> Worklist; Worklist.push_back(&TopInst); while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); + MachineInstr &Inst = *Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst.getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = getVALUOp(Inst); // Handle some special cases switch (Opcode) { default: - if (isSMRD(*Inst)) { - moveSMRDToVALU(Inst, MRI, Worklist); - continue; - } break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case 
AMDGPU::S_OR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BCNT1_I32_B64: splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BFE_I64: { splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; } case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -2536,9 +2577,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_ABS_I32: lowerScalarAbs(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: + // Clear unused bits of vcc + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), + AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + break; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2553,34 +2603,36 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Use the new VALU Opcode. const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); + Inst.setDesc(NewDesc); // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); + for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst.getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + Inst.RemoveOperand(i); + addSCCDefUsersToVALUWorklist(Inst, Worklist); + } } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset. 
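// Worked example of the BFE encodings handled here (values illustrative):
// the scalar S_BFE_U32/S_BFE_I32 pack the field as (width << 16) | offset,
// so an immediate of 0x00080004 means offset 4 and width 8, and is unpacked
// further below via (Imm & 0x3f) and ((Imm & 0x7f0000) >> 16) into the two
// separate operands that V_BFE_U32/V_BFE_I32 expect; S_SEXT_I32_I8/_I16
// simply become a signed BFE with offset 0 and width 8 or 16.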
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); + Inst.addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(Size)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(0)); } - Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); + Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + const MachineOperand &OffsetWidthOp = Inst.getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand // back into the 2 separate ones for bit offset and width. assert(OffsetWidthOp.isImm() && @@ -2589,50 +2641,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst.RemoveOperand(2); // Remove old immediate. + Inst.addOperand(MachineOperand::CreateImm(Offset)); + Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } - // Update the destination register class. - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); - if (!NewDstRC) - continue; + bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); + unsigned NewDstReg = AMDGPU::NoRegister; + if (HasDst) { + // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + if (!NewDstRC) + continue; - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); + unsigned DstReg = Inst.getOperand(0).getReg(); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } // Legalize the operands legalizeOperands(Inst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (HasDst) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2649,15 +2692,14 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2703,16 +2745,15 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2738,9 +2779,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = 
MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); + MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); @@ -2748,9 +2789,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub1, Src1SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); + MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -2770,16 +2811,16 @@ void SIInstrInfo::splitScalar64BitBinaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); +void SIInstrInfo::splitScalar64BitBCNT( + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); const TargetRegisterClass *SrcRC = Src.isReg() ? @@ -2812,24 +2853,22 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - uint32_t Imm = Inst->getOperand(2).getImm(); + MachineOperand &Dest = Inst.getOperand(0); + uint32_t Imm = Inst.getOperand(2).getImm(); uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. (void) Offset; // Only sext_inreg cases handled. 
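// Illustrative expansion (register names assumed): for a 64-bit sign-extend
// in-register of width 8, the BitWidth < 32 path below produces
//   %lo = V_BFE_I32 %src.sub0, 0, 8      // sign-extend the low field
//   %hi = V_ASHRREV_I32 31, %lo          // broadcast the sign bit
//   %dst:vreg_64 = REG_SEQUENCE %lo, sub0, %hi, sub1
// while a full 32-bit width only needs the shift to form the high half.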
- assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && - BitWidth <= 32 && - Offset == 0 && - "Not implemented"); + assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && + Offset == 0 && "Not implemented"); if (BitWidth < 32) { unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2837,9 +2876,9 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) - .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) - .addImm(0) - .addImm(BitWidth); + .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) .addImm(31) @@ -2856,7 +2895,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, return; } - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); @@ -2887,6 +2926,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } +void SIInstrInfo::addSCCDefUsersToVALUWorklist( + MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { + // This assumes that all the users of SCC are in the same block + // as the SCC def. + for (MachineInstr &MI : + llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), + SCCDefInst.getParent()->end())) { + // Exit if we find another SCC def. + if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + return; + + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + Worklist.push_back(&MI); + } +} + const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); @@ -2912,9 +2967,9 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( } // Find the one SGPR operand we are allowed to use. -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); // Find the one SGPR operand we are allowed to use. // @@ -2925,19 +2980,19 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. 
- unsigned SGPRReg = findImplicitSGPRRead(*MI); + unsigned SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; if (Idx == -1) break; - const MachineOperand &MO = MI->getOperand(Idx); + const MachineOperand &MO = MI.getOperand(Idx); if (!MO.isReg()) continue; @@ -2981,70 +3036,6 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, return SGPRReg; } -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, unsigned OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); @@ -3059,9 +3050,9 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.isAmdHsaOS()) { RsrcDataFormat |= (1ULL << 56); - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - // Set MTYPE = 2 - RsrcDataFormat |= (2ULL << 59); + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + // Set MTYPE = 2 + RsrcDataFormat |= (2ULL << 59); } return RsrcDataFormat; @@ -3072,22 +3063,103 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + + Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | + // 
IndexStride = 64 + (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); + // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; } -bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); return isSMRD(Opc); } -bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); } + +unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); + unsigned DescSize = Desc.getSize(); + + // If we have a definitive size, we can use it. Otherwise we need to inspect + // the operands to know the size. + if (DescSize == 8 || DescSize == 4) + return DescSize; + + assert(DescSize == 0); + + // 4-byte instructions may have a 32-bit literal encoded after them. Check + // operands that coud ever be literals. + if (isVALU(MI) || isSALU(MI)) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return 4; // No operands. + + if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) + return 8; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return 4; + + if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) + return 8; + + return 4; + } + + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::DBG_VALUE: + case TargetOpcode::BUNDLE: + case TargetOpcode::EH_LABEL: + return 0; + case TargetOpcode::INLINEASM: { + const MachineFunction *MF = MI.getParent()->getParent(); + const char *AsmStr = MI.getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } + default: + llvm_unreachable("unable to find instruction size"); + } +} + +ArrayRef<std::pair<int, const char *>> +SIInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} + +/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The +/// post-RA version of misched uses CreateTargetMIHazardRecognizer. +ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const { + return new GCNHazardRecognizer(DAG->MF); +} + +/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer +/// pass. 
+ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { + return new GCNHazardRecognizer(MF); +} diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index cce1ae725611..227b817227c2 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -13,8 +13,8 @@ //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_SIINSTRINFO_H -#define LLVM_LIB_TARGET_R600_SIINSTRINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIINSTRINFO_H #include "AMDGPUInstrInfo.h" #include "SIDefines.h" @@ -22,9 +22,24 @@ namespace llvm { -class SIInstrInfo : public AMDGPUInstrInfo { +class SIInstrInfo final : public AMDGPUInstrInfo { private: const SIRegisterInfo RI; + const SISubtarget &ST; + + // The the inverse predicate should have the negative value. + enum BranchPredicate { + INVALID_BR = 0, + SCC_TRUE = 1, + SCC_FALSE = -1, + VCCNZ = 2, + VCCZ = -2, + EXECNZ = -3, + EXECZ = 3 + }; + + static unsigned getBranchOpcode(BranchPredicate Cond); + static BranchPredicate getBranchPredicate(unsigned Opcode); unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, @@ -39,87 +54,89 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void swapOperands(MachineInstr &Inst) const; void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode) const; void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, unsigned Opcode) const; + MachineInstr &Inst, unsigned Opcode) const; void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const; + MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, SmallVectorImpl<MachineInstr *> &Worklist) const; + void + addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, + SmallVectorImpl<MachineInstr *> &Worklist) const; + const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; - bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, - MachineInstr *MIb) const; + bool checkInstOffsetsDoNotOverlap(MachineInstr &MIa, MachineInstr &MIb) const; - unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; + unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: - MachineInstr *commuteInstructionImpl(MachineInstr *MI, - bool NewMI, + MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override; public: - explicit SIInstrInfo(const AMDGPUSubtarget &st); - const SIRegisterInfo &getRegisterInfo() const override { + enum TargetOperandFlags { + MO_NONE = 0, + MO_GOTPCREL = 1 + }; + + explicit SIInstrInfo(const SISubtarget &); + + const SIRegisterInfo &getRegisterInfo() const { return RI; } - bool isReallyTriviallyReMaterializable(const MachineInstr *MI, + bool isReallyTriviallyReMaterializable(const MachineInstr &MI, AliasAnalysis *AA) const override; bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t 
&Offset1, int64_t &Offset2) const override; - bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, + bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const final; + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + unsigned NumLoads) const final; - void copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; - unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - RegScavenger *RS, - unsigned TmpReg, - unsigned Offset, - unsigned Size) const; + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, MachineInstr &MI, + RegScavenger *RS, unsigned TmpReg, + unsigned Offset, unsigned Size) const; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MI, unsigned SrcReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, + MachineBasicBlock::iterator MI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; // \brief Returns an opcode that can be used to move a value to a \p DstRC // register. 
If there is no hardware instruction that can store to \p @@ -129,28 +146,40 @@ public: LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - bool findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, + bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool areMemAccessesTriviallyDisjoint( - MachineInstr *MIa, MachineInstr *MIb, - AliasAnalysis *AA = nullptr) const override; + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const override; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const override; - MachineInstr *buildMovInstr(MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned DstReg, unsigned SrcReg) const override; - bool isMov(unsigned Opcode) const override; + bool ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const final; + bool + areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, + AliasAnalysis *AA = nullptr) const override; + + bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg, + MachineRegisterInfo *MRI) const final; unsigned getMachineCSELookAheadLimit() const override { return 500; } MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB, - MachineBasicBlock::iterator &MI, + MachineInstr &MI, LiveVariables *LV) const override; + bool isSchedulingBoundary(const MachineInstr &MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + static bool isSALU(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SALU; } @@ -167,6 +196,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isVMEM(const MachineInstr &MI) { + return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI); + } + + bool isVMEM(uint16_t Opcode) const { + return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode); + } + static bool isSOP1(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SOP1; } @@ -279,6 +316,14 @@ public: return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isGather4(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::Gather4; + } + + bool isGather4(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::Gather4; + } + static bool isFLAT(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::FLAT; } @@ -303,11 +348,35 @@ public: return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } + static bool isDPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DPP; + } + + bool isDPP(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::DPP; + } + + static bool isScalarUnit(const MachineInstr &MI) { + return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD); + } + + static bool usesVM_CNT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VM_CNT; + } + + bool isVGPRCopy(const MachineInstr &MI) const { + assert(MI.isCopy()); + unsigned Dest = MI.getOperand(0).getReg(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + return !RI.isSGPRReg(MRI, Dest); + } + bool 
isInlineConstant(const APInt &Imm) const; bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isImmOperandLegal(const MachineInstr *MI, unsigned OpNo, + bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; /// \brief Return true if this 64-bit VALU instruction has a 32-bit encoding. @@ -326,7 +395,7 @@ public: bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const; - bool verifyInstruction(const MachineInstr *MI, + bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override; static unsigned getVALUOp(const MachineInstr &MI); @@ -374,11 +443,11 @@ public: /// /// If the operand being legalized is a register, then a COPY will be used /// instead of MOV. - void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const; + void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const; /// \brief Check if \p MO is a legal operand if it was the \p OpIdx Operand /// for \p MI. - bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, + bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; /// \brief Check if \p MO would be a valid operand for the given operand @@ -396,52 +465,38 @@ public: /// \brief Legalize operands in \p MI by either commuting it or inserting a /// copy of src1. - void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const; /// \brief Fix operands in \p MI to satisfy constant bus requirements. - void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const; - /// \brief Legalize all operands in this instruction. This function may - /// create new instruction and insert them before \p MI. - void legalizeOperands(MachineInstr *MI) const; + /// Copy a value from a VGPR (\p SrcReg) to SGPR. This function can only + /// be used when it is know that the value in SrcReg is same across all + /// threads in the wave. + /// \returns The SGPR register that \p SrcReg was copied to. + unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, + MachineRegisterInfo &MRI) const; - /// \brief Split an SMRD instruction into two smaller loads of half the - // size storing the results in \p Lo and \p Hi. - void splitSMRD(MachineInstr *MI, const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const; + void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const; + /// \brief Legalize all operands in this instruction. This function may + /// create new instruction and insert them before \p MI. + void legalizeOperands(MachineInstr &MI) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the /// VALU if necessary. 
   void moveToVALU(MachineInstr &MI) const;

-  unsigned calculateIndirectAddress(unsigned RegIndex,
-                                    unsigned Channel) const override;
-
-  const TargetRegisterClass *getIndirectAddrRegClass() const override;
+  void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
+                        int Count) const;

-  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                         MachineBasicBlock::iterator I,
-                                         unsigned ValueReg,
-                                         unsigned Address,
-                                         unsigned OffsetReg) const override;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;

-  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                        MachineBasicBlock::iterator I,
-                                        unsigned ValueReg,
-                                        unsigned Address,
-                                        unsigned OffsetReg) const override;
-  void reserveIndirectRegisters(BitVector &Reserved,
-                                const MachineFunction &MF) const;
-
-  void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
-              unsigned SavReg, unsigned IndexReg) const;
-
-  void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const;
+  /// \brief Return the number of wait states that result from executing this
+  /// instruction.
+  unsigned getNumWaitStates(const MachineInstr &MI) const;

   /// \brief Returns the operand named \p Op. If \p MI does not have an
   /// operand named \c Op, this function returns nullptr.
@@ -463,8 +518,26 @@ public:
   uint64_t getDefaultRsrcDataFormat() const;
   uint64_t getScratchRsrcWords23() const;

-  bool isLowLatencyInstruction(const MachineInstr *MI) const;
-  bool isHighLatencyInstruction(const MachineInstr *MI) const;
+  bool isLowLatencyInstruction(const MachineInstr &MI) const;
+  bool isHighLatencyInstruction(const MachineInstr &MI) const;
+
+  /// \brief Return the descriptor of the target-specific machine instruction
+  /// that corresponds to the specified pseudo or native opcode.
+ const MCInstrDesc &getMCOpcodeFromPseudo(unsigned Opcode) const { + return get(pseudoToMCOpcode(Opcode)); + } + + unsigned getInstSizeInBytes(const MachineInstr &MI) const; + + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, + const ScheduleDAG *DAG) const override; + + ScheduleHazardRecognizer * + CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override; }; namespace AMDGPU { @@ -490,8 +563,9 @@ namespace AMDGPU { int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; - const uint64_t RSRC_TID_ENABLE = 1LL << 55; - + const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); + const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); + const uint64_t RSRC_TID_ENABLE = UINT64_C(1) << (32 + 23); } // End namespace AMDGPU namespace SI { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 8735277149a6..253cc32b27e4 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -7,9 +7,9 @@ // //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SEA_ISLANDS">; + ">= SISubtarget::SEA_ISLANDS">; def isCIOnly : Predicate<"Subtarget->getGeneration() ==" - "AMDGPUSubtarget::SEA_ISLANDS">, + "SISubtarget::SEA_ISLANDS">, AssemblerPredicate <"FeatureSeaIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; @@ -69,6 +69,11 @@ class sopk <bits<5> si, bits<5> vi = si> { field bits<5> VI = vi; } +class dsop <bits<8> si, bits<8> vi = si> { + field bits<8> SI = si; + field bits<8> VI = vi; +} + // Specify an SMRD opcode for SI and SMEM opcode for VI // FIXME: This should really be bits<5> si, Tablegen crashes if @@ -78,9 +83,9 @@ class smrd<bits<8> si, bits<8> vi = si> { field bits<8> VI = vi; } -// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum -// in AMDGPUInstrInfo.cpp -def SISubtarget { +// Execpt for the NONE field, this must be kept in sync with the +// SIEncodingFamily enum in AMDGPUInstrInfo.cpp +def SIEncodingFamily { int NONE = -1; int SI = 0; int VI = 1; @@ -95,6 +100,14 @@ def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT", [SDNPMayLoad, SDNPMemOperand] >; +def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + +def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] +>; + def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTypeProfile<0, 13, [SDTCisVT<0, v4i32>, // rsrc(SGPR) @@ -120,7 +133,7 @@ def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT", >; class SDSample<string opcode> : SDNode <opcode, - SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>, + SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>, SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]> >; @@ -129,9 +142,8 @@ def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">; def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; -def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, - SDTCisVT<0, i64>]> +def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> >; 
//===----------------------------------------------------------------------===// @@ -140,12 +152,14 @@ def SIconstdata_ptr : SDNode< class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr), (ld node:$ptr), [{ - return isFlatLoad(dyn_cast<LoadSDNode>(N)) || - isGlobalLoad(dyn_cast<LoadSDNode>(N)) || - isConstantLoad(cast<LoadSDNode>(N), -1); + const MemSDNode *LD = cast<MemSDNode>(N); + return LD->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; }]>; def flat_load : flat_ld <load>; +def atomic_flat_load : flat_ld<atomic_load>; def flat_az_extloadi8 : flat_ld <az_extloadi8>; def flat_sextloadi8 : flat_ld <sextloadi8>; def flat_az_extloadi16 : flat_ld <az_extloadi16>; @@ -153,26 +167,50 @@ def flat_sextloadi16 : flat_ld <sextloadi16>; class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr), (st node:$val, node:$ptr), [{ - return isFlatStore(dyn_cast<StoreSDNode>(N)) || - isGlobalStore(dyn_cast<StoreSDNode>(N)); + const MemSDNode *ST = cast<MemSDNode>(N); + return ST->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ST->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; def flat_store: flat_st <store>; +def atomic_flat_store: flat_st <atomic_store>; def flat_truncstorei8 : flat_st <truncstorei8>; def flat_truncstorei16 : flat_st <truncstorei16>; +class MubufLoad <SDPatternOperator op> : PatFrag < + (ops node:$ptr), (op node:$ptr), [{ -def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ - return isGlobalLoad(cast<LoadSDNode>(N)) || - isConstantLoad(cast<LoadSDNode>(N), -1); + const MemSDNode *LD = cast<MemSDNode>(N); + return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || + LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; }]>; +def mubuf_load : MubufLoad <load>; +def mubuf_az_extloadi8 : MubufLoad <az_extloadi8>; +def mubuf_sextloadi8 : MubufLoad <sextloadi8>; +def mubuf_az_extloadi16 : MubufLoad <az_extloadi16>; +def mubuf_sextloadi16 : MubufLoad <sextloadi16>; + +def mubuf_load_atomic : MubufLoad <atomic_load>; + def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ - return isConstantLoad(cast<LoadSDNode>(N), -1) && - static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); + auto Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= 4 && + Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); }]>; //===----------------------------------------------------------------------===// +// PatFrags for global memory operations +//===----------------------------------------------------------------------===// + +def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>; +def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>; + +def atomic_inc_flat : flat_binary_atomic_op<SIatomic_inc>; +def atomic_dec_flat : flat_binary_atomic_op<SIatomic_dec>; + +//===----------------------------------------------------------------------===// // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 // to be glued to the memory instructions. 
//===----------------------------------------------------------------------===// @@ -182,7 +220,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad, >; def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{ - return isLocalLoad(cast<LoadSDNode>(N)); + return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{ @@ -219,7 +257,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore, def si_st_local : PatFrag < (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{ - return isLocalStore(cast<StoreSDNode>(N)); + return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; }]>; def si_store_local : PatFrag < @@ -247,9 +285,34 @@ def si_truncstore_local_i16 : PatFrag < return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16; }]>; -multiclass SIAtomicM0Glue2 <string op_name> { +def si_setcc_uniform : PatFrag < + (ops node:$lhs, node:$rhs, node:$cond), + (setcc node:$lhs, node:$rhs, node:$cond), [{ + for (SDNode *Use : N->uses()) { + if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg) + return false; + + unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg(); + if (Reg != AMDGPU::SCC) + return false; + } + return true; +}]>; + +def si_uniform_br : PatFrag < + (ops node:$cond, node:$bb), (brcond node:$cond, node:$bb), [{ + return isUniformBr(N); +}]>; + +def si_uniform_br_scc : PatFrag < + (ops node:$cond, node:$bb), (si_uniform_br node:$cond, node:$bb), [{ + return isCBranchSCC(N); +}]>; + +multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> { - def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2, + def _glue : SDNode < + !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue] >; @@ -257,11 +320,13 @@ multiclass SIAtomicM0Glue2 <string op_name> { } defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; +defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>; +defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">; -defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">; defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">; defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">; @@ -347,6 +412,10 @@ def IMM16bit : PatLeaf <(imm), [{return isUInt<16>(N->getZExtValue());}] >; +def SIMM16bit : PatLeaf <(imm), + [{return isInt<16>(N->getSExtValue());}] +>; + def IMM20bit : PatLeaf <(imm), [{return isUInt<20>(N->getZExtValue());}] >; @@ -369,7 +438,7 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{ }]>; class SGPRImm <dag frag> : PatLeaf<frag, [{ - if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) { return false; } const SIRegisterInfo *SIRI = @@ -402,188 +471,133 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } -def const_ga : Operand<iPTR>; - -include "SIInstrFormats.td" -include "VIInstrFormats.td" +def si_ga : Operand<iPTR>; -def MubufOffsetMatchClass : AsmOperandClass { - let Name = "MubufOffset"; - let ParserMethod = "parseMubufOptionalOps"; - let RenderMethod = "addImmOperands"; +def InterpSlot : 
Operand<i32> { + let PrintMethod = "printInterpSlot"; } -class DSOffsetBaseMatchClass <string parser> : AsmOperandClass { - let Name = "DSOffset"#parser; - let ParserMethod = parser; +def SendMsgMatchClass : AsmOperandClass { + let Name = "SendMsg"; + let PredicateMethod = "isSendMsg"; + let ParserMethod = "parseSendMsgOp"; let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset"; } -def DSOffsetMatchClass : DSOffsetBaseMatchClass <"parseDSOptionalOps">; -def DSOffsetGDSMatchClass : DSOffsetBaseMatchClass <"parseDSOffsetOptional">; - -def DSOffset01MatchClass : AsmOperandClass { - let Name = "DSOffset1"; - let ParserMethod = "parseDSOff01OptionalOps"; - let RenderMethod = "addImmOperands"; - let PredicateMethod = "isDSOffset01"; +def SendMsgImm : Operand<i32> { + let PrintMethod = "printSendMsg"; + let ParserMatchClass = SendMsgMatchClass; } -class GDSBaseMatchClass <string parser> : AsmOperandClass { - let Name = "GDS"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; +def SWaitMatchClass : AsmOperandClass { + let Name = "SWaitCnt"; let RenderMethod = "addImmOperands"; + let ParserMethod = "parseSWaitCntOps"; } -def GDSMatchClass : GDSBaseMatchClass <"parseDSOptionalOps">; -def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; - -class GLCBaseMatchClass <string parser> : AsmOperandClass { - let Name = "GLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; +def WAIT_FLAG : Operand <i32> { + let ParserMatchClass = SWaitMatchClass; + let PrintMethod = "printWaitFlag"; } -def GLCMubufMatchClass : GLCBaseMatchClass <"parseMubufOptionalOps">; -def GLCFlatMatchClass : GLCBaseMatchClass <"parseFlatOptionalOps">; +include "SIInstrFormats.td" +include "VIInstrFormats.td" -class SLCBaseMatchClass <string parser> : AsmOperandClass { - let Name = "SLC"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; +class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass { + let Name = "Imm"#CName; + let PredicateMethod = "is"#CName; + let ParserMethod = !if(Optional, "parseOptionalOperand", "parse"#CName); let RenderMethod = "addImmOperands"; + let IsOptional = Optional; + let DefaultMethod = !if(Optional, "default"#CName, ?); } -def SLCMubufMatchClass : SLCBaseMatchClass <"parseMubufOptionalOps">; -def SLCFlatMatchClass : SLCBaseMatchClass <"parseFlatOptionalOps">; -def SLCFlatAtomicMatchClass : SLCBaseMatchClass <"parseFlatAtomicOptionalOps">; - -class TFEBaseMatchClass <string parser> : AsmOperandClass { - let Name = "TFE"#parser; - let PredicateMethod = "isImm"; - let ParserMethod = parser; - let RenderMethod = "addImmOperands"; +class NamedOperandBit<string Name, AsmOperandClass MatchClass> : Operand<i1> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def TFEMubufMatchClass : TFEBaseMatchClass <"parseMubufOptionalOps">; -def TFEFlatMatchClass : TFEBaseMatchClass <"parseFlatOptionalOps">; -def TFEFlatAtomicMatchClass : TFEBaseMatchClass <"parseFlatAtomicOptionalOps">; - -def OModMatchClass : AsmOperandClass { - let Name = "OMod"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let RenderMethod = "addImmOperands"; +class NamedOperandU8<string Name, AsmOperandClass MatchClass> : Operand<i8> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def ClampMatchClass : AsmOperandClass { - let Name = "Clamp"; - let PredicateMethod = "isImm"; - let ParserMethod = "parseVOP3OptionalOps"; - let 
RenderMethod = "addImmOperands"; +class NamedOperandU16<string Name, AsmOperandClass MatchClass> : Operand<i16> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { - let Name = "SMRDOffset"#predicate; - let PredicateMethod = predicate; - let RenderMethod = "addImmOperands"; +class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> { + let PrintMethod = "print"#Name; + let ParserMatchClass = MatchClass; } -def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; -def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < - "isSMRDLiteralOffset" ->; - let OperandType = "OPERAND_IMMEDIATE" in { -def offen : Operand<i1> { - let PrintMethod = "printOffen"; -} -def idxen : Operand<i1> { - let PrintMethod = "printIdxen"; -} -def addr64 : Operand<i1> { - let PrintMethod = "printAddr64"; -} -def mbuf_offset : Operand<i16> { - let PrintMethod = "printMBUFOffset"; - let ParserMatchClass = MubufOffsetMatchClass; -} -class ds_offset_base <AsmOperandClass mc> : Operand<i16> { - let PrintMethod = "printDSOffset"; - let ParserMatchClass = mc; -} -def ds_offset : ds_offset_base <DSOffsetMatchClass>; -def ds_offset_gds : ds_offset_base <DSOffsetGDSMatchClass>; +def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>; +def idxen : NamedOperandBit<"Idxen", NamedMatchClass<"Idxen">>; +def addr64 : NamedOperandBit<"Addr64", NamedMatchClass<"Addr64">>; -def ds_offset0 : Operand<i8> { - let PrintMethod = "printDSOffset0"; - let ParserMatchClass = DSOffset01MatchClass; -} -def ds_offset1 : Operand<i8> { - let PrintMethod = "printDSOffset1"; - let ParserMatchClass = DSOffset01MatchClass; -} -class gds_base <AsmOperandClass mc> : Operand <i1> { - let PrintMethod = "printGDS"; - let ParserMatchClass = mc; -} -def gds : gds_base <GDSMatchClass>; +def offset : NamedOperandU16<"Offset", NamedMatchClass<"Offset">>; +def offset0 : NamedOperandU8<"Offset0", NamedMatchClass<"Offset0">>; +def offset1 : NamedOperandU8<"Offset1", NamedMatchClass<"Offset1">>; -def gds01 : gds_base <GDS01MatchClass>; +def gds : NamedOperandBit<"GDS", NamedMatchClass<"GDS">>; -class glc_base <AsmOperandClass mc> : Operand <i1> { - let PrintMethod = "printGLC"; - let ParserMatchClass = mc; -} +def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>; +def clampmod : NamedOperandBit<"ClampSI", NamedMatchClass<"ClampSI">>; -def glc : glc_base <GLCMubufMatchClass>; -def glc_flat : glc_base <GLCFlatMatchClass>; +def smrd_offset : NamedOperandU32<"SMRDOffset", NamedMatchClass<"SMRDOffset">>; +def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset", NamedMatchClass<"SMRDLiteralOffset">>; -class slc_base <AsmOperandClass mc> : Operand <i1> { - let PrintMethod = "printSLC"; - let ParserMatchClass = mc; -} +def glc : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; +def slc : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; +def tfe : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def unorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; +def da : NamedOperandBit<"DA", NamedMatchClass<"DA">>; +def r128 : NamedOperandBit<"R128", NamedMatchClass<"R128">>; +def lwe : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>; -def slc : slc_base <SLCMubufMatchClass>; -def slc_flat : slc_base <SLCFlatMatchClass>; -def slc_flat_atomic : slc_base <SLCFlatAtomicMatchClass>; +def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; -class tfe_base <AsmOperandClass mc> : Operand <i1> { - let PrintMethod = "printTFE"; 
- let ParserMatchClass = mc; -} +def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; +def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>; +def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>; +def bound_ctrl : NamedOperandBit<"BoundCtrl", NamedMatchClass<"BoundCtrl">>; -def tfe : tfe_base <TFEMubufMatchClass>; -def tfe_flat : tfe_base <TFEFlatMatchClass>; -def tfe_flat_atomic : tfe_base <TFEFlatAtomicMatchClass>; +def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>; +def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; +def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; +def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; -def omod : Operand <i32> { - let PrintMethod = "printOModSI"; - let ParserMatchClass = OModMatchClass; -} +def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>; + +} // End OperandType = "OPERAND_IMMEDIATE" -def ClampMod : Operand <i1> { - let PrintMethod = "printClampSI"; - let ParserMatchClass = ClampMatchClass; -} -def smrd_offset : Operand <i32> { - let PrintMethod = "printU32ImmOperand"; - let ParserMatchClass = SMRDOffsetMatchClass; +def VOPDstS64 : VOPDstOperand <SReg_64>; + +def FPInputModsMatchClass : AsmOperandClass { + let Name = "RegOrImmWithFPInputMods"; + let ParserMethod = "parseRegOrImmWithFPInputMods"; + let PredicateMethod = "isRegOrImmWithInputMods"; } -def smrd_literal_offset : Operand <i32> { - let PrintMethod = "printU32ImmOperand"; - let ParserMatchClass = SMRDLiteralOffsetMatchClass; +def FPInputMods : Operand <i32> { + let PrintMethod = "printOperandAndFPInputMods"; + let ParserMatchClass = FPInputModsMatchClass; } -} // End OperandType = "OPERAND_IMMEDIATE" +def IntInputModsMatchClass : AsmOperandClass { + let Name = "RegOrImmWithIntInputMods"; + let ParserMethod = "parseRegOrImmWithIntInputMods"; + let PredicateMethod = "isRegOrImmWithInputMods"; +} -def VOPDstS64 : VOPDstOperand <SReg_64>; +def IntInputMods: Operand <i32> { + let PrintMethod = "printOperandAndIntInputMods"; + let ParserMatchClass = IntInputModsMatchClass; +} //===----------------------------------------------------------------------===// // Complex patterns @@ -595,9 +609,13 @@ def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">; def MUBUFAddr64 : ComplexPattern<i64, 7, "SelectMUBUFAddr64">; def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">; +def FLATAtomic : ComplexPattern<i64, 3, "SelectFlat">; def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; +def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def MUBUFIntrinsicOffset : ComplexPattern<i32, 2, "SelectMUBUFIntrinsicOffset">; +def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset">; def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; @@ -606,6 +624,8 @@ def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; +def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def 
VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -670,17 +690,24 @@ class EXPCommon : InstSI< let EXP_CNT = 1; let Uses = [EXEC]; + let SchedRW = [WriteExport]; } multiclass EXP_m { let isPseudo = 1, isCodeGenOnly = 1 in { - def "" : EXPCommon, SIMCInstr <"exp", SISubtarget.NONE> ; + def "" : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.NONE> ; } - def _si : EXPCommon, SIMCInstr <"exp", SISubtarget.SI>, EXPe; + def _si : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.SI>, EXPe { + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; + } - def _vi : EXPCommon, SIMCInstr <"exp", SISubtarget.VI>, EXPe_vi; + def _vi : EXPCommon, SIMCInstr <"exp", SIEncodingFamily.VI>, EXPe_vi { + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; + } } //===----------------------------------------------------------------------===// @@ -689,7 +716,7 @@ multiclass EXP_m { class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SOP1 <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -697,17 +724,21 @@ class SOP1_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : class SOP1_Real_si <sop1 op, string opName, dag outs, dag ins, string asm> : SOP1 <outs, ins, asm, []>, SOP1e <op.SI>, - SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let isCodeGenOnly = 0; let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class SOP1_Real_vi <sop1 op, string opName, dag outs, dag ins, string asm> : SOP1 <outs, ins, asm, []>, SOP1e <op.VI>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { let isCodeGenOnly = 0; let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, @@ -722,27 +753,27 @@ multiclass SOP1_m <sop1 op, string opName, dag outs, dag ins, string asm, } multiclass SOP1_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0), + opName#" $sdst, $src0", pattern >; multiclass SOP1_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0), + opName#" $sdst, $src0", pattern >; // no input, 64-bit output. multiclass SOP1_64_0 <sop1 op, string opName, list<dag> pattern> { - def "" : SOP1_Pseudo <opName, (outs SReg_64:$dst), (ins), pattern>; + def "" : SOP1_Pseudo <opName, (outs SReg_64:$sdst), (ins), pattern>; - def _si : SOP1_Real_si <op, opName, (outs SReg_64:$dst), (ins), - opName#" $dst"> { - let ssrc0 = 0; + def _si : SOP1_Real_si <op, opName, (outs SReg_64:$sdst), (ins), + opName#" $sdst"> { + let src0 = 0; } - def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$dst), (ins), - opName#" $dst"> { - let ssrc0 = 0; + def _vi : SOP1_Real_vi <op, opName, (outs SReg_64:$sdst), (ins), + opName#" $sdst"> { + let src0 = 0; } } @@ -763,13 +794,19 @@ multiclass SOP1_1 <sop1 op, string opName, list<dag> pattern> { // 64-bit input, 32-bit output. 
multiclass SOP1_32_64 <sop1 op, string opName, list<dag> pattern> : SOP1_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_64:$src0), - opName#" $dst, $src0", pattern + op, opName, (outs SReg_32:$sdst), (ins SSrc_64:$src0), + opName#" $sdst, $src0", pattern +>; + +// 32-bit input, 64-bit output. +multiclass SOP1_64_32 <sop1 op, string opName, list<dag> pattern> : SOP1_m < + op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0), + opName#" $sdst, $src0", pattern >; class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : SOP2<outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; let Size = 4; @@ -784,15 +821,19 @@ class SOP2_Pseudo<string opName, dag outs, dag ins, list<dag> pattern> : class SOP2_Real_si<sop2 op, string opName, dag outs, dag ins, string asm> : SOP2<outs, ins, asm, []>, SOP2e<op.SI>, - SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : SOP2<outs, ins, asm, []>, SOP2e<op.VI>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, @@ -807,36 +848,49 @@ multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, } multiclass SOP2_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_32:$dst), (ins SSrc_32:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_32:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern >; multiclass SOP2_64 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_64:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_64:$src1), + opName#" $sdst, $src0, $src1", pattern >; multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < - op, opName, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1), - opName#" $dst, $src0, $src1", pattern + op, opName, (outs SReg_64:$sdst), (ins SSrc_64:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern >; -class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, - string opName, PatLeaf cond> : SOPC < - op, (outs), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []> { +multiclass SOP2_64_32_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < + op, opName, (outs SReg_64:$sdst), (ins SSrc_32:$src0, SSrc_32:$src1), + opName#" $sdst, $src0, $src1", pattern +>; + +class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1, + string opName, list<dag> pattern = []> : SOPC < + op, (outs), (ins rc0:$src0, rc1:$src1), + opName#" $src0, $src1", pattern > { let Defs = [SCC]; } +class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt, + string opName, PatLeaf cond> : SOPC_Base < + op, rc, rc, opName, + [(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > { +} -class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> +class SOPC_CMP_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; -class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL> - 
: SOPC_Helper<op, SSrc_64, i64, opName, cond>; +class SOPC_32<bits<7> op, string opName, list<dag> pattern = []> + : SOPC_Base<op, SSrc_32, SSrc_32, opName, pattern>; + +class SOPC_64_32<bits<7> op, string opName, list<dag> pattern = []> + : SOPC_Base<op, SSrc_64, SSrc_32, opName, pattern>; class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SOPK <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -844,16 +898,20 @@ class SOPK_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : class SOPK_Real_si <sopk op, string opName, dag outs, dag ins, string asm> : SOPK <outs, ins, asm, []>, SOPKe <op.SI>, - SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; let isCodeGenOnly = 0; } class SOPK_Real_vi <sopk op, string opName, dag outs, dag ins, string asm> : SOPK <outs, ins, asm, []>, SOPKe <op.VI>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; let isCodeGenOnly = 0; } @@ -868,14 +926,14 @@ multiclass SOPK_m <sopk op, string opName, dag outs, dag ins, string opAsm, } multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SReg_32:$dst), (ins u16imm:$src0), + def "" : SOPK_Pseudo <opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), pattern>; - def _si : SOPK_Real_si <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), - opName#" $dst, $src0">; + def _si : SOPK_Real_si <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), + opName#" $sdst, $simm16">; - def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$dst), (ins u16imm:$src0), - opName#" $dst, $src0">; + def _vi : SOPK_Real_vi <op, opName, (outs SReg_32:$sdst), (ins u16imm:$simm16), + opName#" $sdst, $simm16">; } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { @@ -908,15 +966,19 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, def _si : SOPK <outs, ins, asm, []>, SOPK64e <op.SI>, - SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; let isCodeGenOnly = 0; } def _vi : SOPK <outs, ins, asm, []>, SOPK64e <op.VI>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; let isCodeGenOnly = 0; } } @@ -926,86 +988,145 @@ multiclass SOPK_IMM32 <sopk op, string opName, dag outs, dag ins, class SMRD_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : SMRD <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, - string asm> : +class SMRD_IMM_Real_si <bits<5> op, string opName, dag outs, dag ins, + string asm> : + SMRD <outs, ins, asm, []>, + SMRD_IMMe <op>, + SIMCInstr<opName, SIEncodingFamily.SI> { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class SMRD_SOFF_Real_si <bits<5> op, string opName, dag outs, dag ins, + string asm> : SMRD <outs, ins, asm, 
[]>, - SMRDe <op, imm>, - SIMCInstr<opName, SISubtarget.SI> { + SMRD_SOFFe <op>, + SIMCInstr<opName, SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + + +class SMRD_IMM_Real_vi <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, + SMEM_IMMe_vi <op>, + SIMCInstr<opName, SIEncodingFamily.VI> { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm, list<dag> pattern = []> : +class SMRD_SOFF_Real_vi <bits<8> op, string opName, dag outs, dag ins, + string asm, list<dag> pattern = []> : SMRD <outs, ins, asm, pattern>, - SMEMe_vi <op, imm>, - SIMCInstr<opName, SISubtarget.VI> { + SMEM_SOFFe_vi <op>, + SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, + +multiclass SMRD_IMM_m <smrd op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; + def _si : SMRD_IMM_Real_si <op.SI, opName, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. let glc = 0 in { - def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; + def _vi : SMRD_IMM_Real_vi <op.VI, opName, outs, ins, asm>; } } -multiclass SMRD_Inval <smrd op, string opName, - SDPatternOperator node> { - let hasSideEffects = 1, mayStore = 1 in { - def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; +multiclass SMRD_SOFF_m <smrd op, string opName, dag outs, dag ins, + string asm, list<dag> pattern> { - let sbase = 0, offset = 0 in { - let sdst = 0 in { - def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; - } + def "" : SMRD_Pseudo <opName, outs, ins, pattern>; + + def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, ins, asm>; + + // glc is only applicable to scalar stores, which are not yet + // implemented. 
+ let glc = 0 in { + def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, ins, asm>; + } +} + +multiclass SMRD_Special <smrd op, string opName, dag outs, + int sdst_ = ?, + string opStr = "", + list<dag> pattern = []> { + let hasSideEffects = 1 in { + def "" : SMRD_Pseudo <opName, outs, (ins), pattern>; + + let sbase = 0, soff = 0, sdst = sdst_ in { + def _si : SMRD_SOFF_Real_si <op.SI, opName, outs, (ins), opName#opStr>; - let glc = 0, sdata = 0 in { - def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + let glc = 0 in { + def _vi : SMRD_SOFF_Real_vi <op.VI, opName, outs, (ins), opName#opStr>; } } } } +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let mayStore = 1 in { + defm : SMRD_Special<op, opName, (outs), 0, "", [(node)]>; + } +} + class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : - SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + SMRD_SOFF_Real_vi<op, opName, (outs), (ins), opName, [(node)]> { let hasSideEffects = 1; let mayStore = 1; let sbase = 0; - let sdata = 0; + let sdst = 0; + let glc = 0; + let soff = 0; +} + +class SMEM_Ret <bits<8> op, string opName, SDPatternOperator node> : + SMRD_SOFF_Real_vi<op, opName, (outs SReg_64:$sdst), (ins), + opName#" $sdst", [(set i64:$sdst, (node))]> { + let hasSideEffects = 1; + let mayStore = ?; + let mayLoad = ?; + let sbase = 0; let glc = 0; - let offset = 0; + let soff = 0; } multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { - defm _IMM : SMRD_m < - op, opName#"_IMM", 1, (outs dstClass:$dst), + defm _IMM : SMRD_IMM_m < + op, opName#"_IMM", (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_offset:$offset), - opName#" $dst, $sbase, $offset", [] + opName#" $sdst, $sbase, $offset", [] >; def _IMM_ci : SMRD < - (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), - opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + (outs dstClass:$sdst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $sdst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { let AssemblerPredicates = [isCIOnly]; + let DecoderNamespace = "CI"; } - defm _SGPR : SMRD_m < - op, opName#"_SGPR", 0, (outs dstClass:$dst), + defm _SGPR : SMRD_SOFF_m < + op, opName#"_SGPR", (outs dstClass:$sdst), (ins baseClass:$sbase, SReg_32:$soff), - opName#" $dst, $sbase, $soff", [] + opName#" $sdst, $sbase, $soff", [] >; } @@ -1013,20 +1134,6 @@ multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, // Vector ALU classes //===----------------------------------------------------------------------===// -// This must always be right before the operand being input modified. -def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> { - let PrintMethod = "printOperandAndMods"; -} - -def InputModsMatchClass : AsmOperandClass { - let Name = "RegWithInputMods"; -} - -def InputModsNoDefault : Operand <i32> { - let PrintMethod = "printOperandAndMods"; - let ParserMatchClass = InputModsMatchClass; -} - class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = !if (!eq(Src0.Value, untyped.Value), 0, @@ -1050,12 +1157,12 @@ class getVOPSrc0ForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } -// Returns the register class to use for source 1 of VOP[12C] for the -// given VT. 
-class getVOPSrc1ForVT<ValueType VT> { +// Returns the vreg register class to use for source operand given VT +class getVregSrcForVT<ValueType VT> { RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } + // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { @@ -1072,8 +1179,10 @@ class getVOP3SrcForVT<ValueType VT> { // Returns 1 if the source arguments have modifiers, 0 if they do not. // XXX - do f16 instructions? class hasModifiers<ValueType SrcVT> { - bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, - !if(!eq(SrcVT.Value, f64.Value), 1, 0)); + bit ret = + !if(!eq(SrcVT.Value, f32.Value), 1, + !if(!eq(SrcVT.Value, f64.Value), 1, + 0)); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. @@ -1089,11 +1198,15 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, bit HasModifiers> { dag ret = + !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP, V_CLREXCP) + (ins), + /* else */ !if (!eq(NumSrcArgs, 1), !if (!eq(HasModifiers, 1), // VOP1 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - ClampMod:$clamp, omod:$omod) + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod) /* else */, // VOP1 without modifiers (ins Src0RC:$src0) @@ -1101,9 +1214,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, !if (!eq(NumSrcArgs, 2), !if (!eq(HasModifiers, 1), // VOP 2 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - ClampMod:$clamp, omod:$omod) + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + FPInputMods:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod) /* else */, // VOP2 without modifiers (ins Src0RC:$src0, Src1RC:$src1) @@ -1111,21 +1224,109 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, /* NumSrcArgs == 3 */, !if (!eq(HasModifiers, 1), // VOP3 with modifiers - (ins InputModsNoDefault:$src0_modifiers, Src0RC:$src0, - InputModsNoDefault:$src1_modifiers, Src1RC:$src1, - InputModsNoDefault:$src2_modifiers, Src2RC:$src2, - ClampMod:$clamp, omod:$omod) + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + FPInputMods:$src1_modifiers, Src1RC:$src1, + FPInputMods:$src2_modifiers, Src2RC:$src2, + clampmod:$clamp, omod:$omod) /* else */, // VOP3 without modifiers (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2) - /* endif */ ))); + /* endif */ )))); +} + +class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, + bit HasModifiers> { + + dag ret = !if (!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl), + !if (!eq(NumSrcArgs, 1), + !if (!eq(HasModifiers, 1), + // VOP1_DPP with modifiers + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* else */, + // VOP1_DPP without modifiers + (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* endif */) + /* NumSrcArgs == 2 */, + !if (!eq(HasModifiers, 1), + // VOP2_DPP with modifiers + (ins FPInputMods:$src0_modifiers, Src0RC:$src0, + FPInputMods:$src1_modifiers, Src1RC:$src1, + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl) + /* else */, + // VOP2_DPP without modifiers + (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl, + row_mask:$row_mask, 
bank_mask:$bank_mask, + bound_ctrl:$bound_ctrl) + /* endif */))); +} + +class getInsSDWA <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs, + bit HasFloatModifiers, ValueType DstVT> { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + !if(HasFloatModifiers, + // VOP1_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel) + /* else */, + // VOP1_SDWA with sext modifier + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel) + /* endif */) + /* NumSrcArgs == 2 */, + !if(HasFloatModifiers, + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + FPInputMods:$src1_fmodifiers, Src1RC:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA or VOPC_SDWA with float modifiers + (ins FPInputMods:$src0_fmodifiers, Src0RC:$src0, + FPInputMods:$src1_fmodifiers, Src1RC:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel) + ), + /* else */ + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA with sext modifiers + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + IntInputMods:$src1_imodifiers, Src1RC:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA or VOPC_SDWA with sext modifier + (ins IntInputMods:$src0_imodifiers, Src0RC:$src0, + IntInputMods:$src1_imodifiers, Src1RC:$src1, + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel) + ) + /* endif */))); +} + +// Outs for DPP and SDWA +class getOutsExt <bit HasDst, ValueType DstVT, RegisterOperand DstRCDPP> { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions + (outs DstRCDPP:$vdst)), + (outs)); // V_NOP } // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <bit HasDst, int NumSrcArgs> { - string dst = "$dst"; +class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> { + string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; @@ -1137,7 +1338,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs> { // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. 
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { + string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1145,8 +1347,71 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<HasDst, NumSrcArgs>.ret, - "$dst, "#src0#src1#src2#"$clamp"#"$omod"); + getAsm32<HasDst, NumSrcArgs, DstVT>.ret, + dst#", "#src0#src1#src2#"$clamp"#"$omod"); +} + +class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", + "$vdst"), + ""); // use $sdst for VOPC + string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); + string src1 = !if(!eq(NumSrcArgs, 1), "", + !if(!eq(NumSrcArgs, 2), " $src1_modifiers", + " $src1_modifiers,")); + string args = !if(!eq(HasModifiers, 0), + getAsm32<0, NumSrcArgs, DstVT>.ret, + ", "#src0#src1); + string ret = dst#args#" $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; +} + +class getAsmSDWA <bit HasDst, int NumSrcArgs, bit HasFloatModifiers, + ValueType DstVT = i32> { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + " vcc", // use vcc token as dst for VOPC instructioins + "$vdst"), + ""); + string src0 = !if(HasFloatModifiers, "$src0_fmodifiers", "$src0_imodifiers"); + string src1 = !if(HasFloatModifiers, "$src1_fmodifiers", "$src1_imodifiers"); + string args = !if(!eq(NumSrcArgs, 0), + "", + !if(!eq(NumSrcArgs, 1), + ", "#src0#"$clamp", + ", "#src0#", "#src1#"$clamp" + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), + "", + !if(!eq(NumSrcArgs, 1), + " $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel and dst_unused for VOPC + " $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + +// Function that checks if instruction supports DPP and SDWA +class getHasExt <int NumSrcArgs, ValueType DstVT = i32, ValueType Src0VT = i32, + ValueType Src1VT = i32> { + bit ret = !if(!eq(NumSrcArgs, 3), + 0, // NumSrcArgs == 3 - No DPP or SDWA for VOP3 + !if(!eq(DstVT.Size, 64), + 0, // 64-bit dst - No DPP or SDWA for 64-bit operands + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src0 + !if(!eq(Src0VT.Size, 64), + 0, // 64-bit src2 + 1 + ) + ) + ) + ); } class VOPProfile <list<ValueType> _ArgVT> { @@ -1158,30 +1423,48 @@ class VOPProfile <list<ValueType> _ArgVT> { field ValueType Src1VT = ArgVT[2]; field ValueType Src2VT = ArgVT[3]; field RegisterOperand DstRC = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRCDPP = getVALUDstForVT<DstVT>.ret; + field RegisterOperand DstRCSDWA = getVALUDstForVT<DstVT>.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT>.ret; - field RegisterClass Src1RC32 = getVOPSrc1ForVT<Src1VT>.ret; + field RegisterClass Src1RC32 = getVregSrcForVT<Src1VT>.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; + field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret; + field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret; + field RegisterClass Src0SDWA = getVregSrcForVT<Src0VT>.ret; + field RegisterClass Src1SDWA = 
getVregSrcForVT<Src1VT>.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); field bit HasDst32 = HasDst; field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; + + field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs)); // VOP3b instructions are a special case with a second explicit // output. This is manually overridden for them. field dag Outs32 = Outs; field dag Outs64 = Outs; + field dag OutsDPP = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; + field dag OutsSDWA = getOutsExt<HasDst, DstVT, DstRCDPP>.ret; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; + field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs, HasModifiers>.ret; + field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs, HasModifiers, DstVT>.ret; + + field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; + field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret; +} - field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; - field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; +class VOP_NO_EXT <VOPProfile p> : VOPProfile <p.ArgVT> { + let HasExt = 0; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order @@ -1194,6 +1477,9 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>; +def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; @@ -1216,10 +1502,10 @@ def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; // Write out to vcc or arbitrary SGPR. def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { - let Asm32 = "$dst, vcc, $src0, $src1"; - let Asm64 = "$dst, $sdst, $src0, $src1"; - let Outs32 = (outs DstRC:$dst); - let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + let Asm32 = "$vdst, vcc, $src0, $src1"; + let Asm64 = "$vdst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); } // Write out to vcc or arbitrary SGPR and read in from vcc or @@ -1231,10 +1517,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // restriction. SGPRs are still allowed because it should // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; - let Asm32 = "$dst, vcc, $src0, $src1, vcc"; - let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; - let Outs32 = (outs DstRC:$dst); - let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; + let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. 
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); +} + +// Read in from vcc or arbitrary SGPR +def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + let Src0RC32 = VCSrc_32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above. + let Asm32 = "$vdst, $src0, $src1, vcc"; + let Asm64 = "$vdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$vdst); + let Outs64 = (outs DstRC:$vdst); // Suppress src2 implied by type since the 32-bit encoding uses an // implicit VCC use. @@ -1263,11 +1562,17 @@ class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, v let Asm32 = "vcc, $src0, $src1"; // The destination for 32-bit encoding is implicit. let HasDst32 = 0; + let Outs64 = (outs DstRC:$sdst); } class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; + let Ins64 = (ins FPInputMods:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); + let Asm64 = "$sdst, $src0_modifiers, $src1"; + let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC64:$src0, + IntInputMods:$src1_imodifiers, Src1RC64:$src1, + clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let AsmSDWA = " vcc, $src0_fmodifiers, $src1_imodifiers$clamp $src0_sel $src1_sel"; + } def VOPC_I1_F32_F32 : VOPC_Profile<f32>; @@ -1281,28 +1586,42 @@ def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; -def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); - let Asm64 = "$dst, $src0, $src1, $src2"; -} def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; -def VOP_MADK : VOPProfile <[f32, f32, f32, f32]> { - field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); - field string Asm = "$dst, $src0, $vsrc1, $src2"; +def VOP_MADAK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins32 = (ins VCSrc_32:$src0, VGPR_32:$src1, u32kimm:$imm); + field string Asm32 = "$vdst, $src0, $src1, $imm"; + field bit HasExt = 0; +} +def VOP_MADMK : VOPProfile <[f32, f32, f32, f32]> { + field dag Ins32 = (ins VCSrc_32:$src0, u32kimm:$imm, VGPR_32:$src1); + field string Asm32 = "$vdst, $src0, $imm, $src1"; + field bit HasExt = 0; } def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<1, 2>.ret; - let Asm64 = getAsm64<1, 2, HasModifiers>.ret; + let InsDPP = (ins FPInputMods:$src0_modifiers, Src0RC32:$src0, + FPInputMods:$src1_modifiers, Src1RC32:$src1, + VGPR_32:$src2, // stub argument + dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, + bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); + let InsSDWA = (ins FPInputMods:$src0_fmodifiers, Src0RC32:$src0, + FPInputMods:$src1_fmodifiers, Src1RC32:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let Asm32 = getAsm32<1, 2, f32>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret; + let AsmDPP = getAsmDPP<1, 2, HasModifiers, f32>.ret; + let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, f32>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : 
VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +// This class is used only with VOPC instructions. Use $sdst for out operand class SIInstAlias <string asm, Instruction inst, VOPProfile p> : InstAlias <asm, (inst)>, PredicateControl { @@ -1313,13 +1632,13 @@ class SIInstAlias <string asm, Instruction inst, VOPProfile p> : !if (p.HasDst32, !if (!eq(p.NumSrcArgs, 0), // 1 dst, 0 src - (inst p.DstRC:$dst), + (inst p.DstRC:$sdst), !if (!eq(p.NumSrcArgs, 1), // 1 dst, 1 src - (inst p.DstRC:$dst, p.Src0RC32:$src0), + (inst p.DstRC:$sdst, p.Src0RC32:$src0), !if (!eq(p.NumSrcArgs, 2), // 1 dst, 2 src - (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + (inst p.DstRC:$sdst, p.Src0RC32:$src0, p.Src1RC32:$src1), // else - unreachable (inst)))), // else @@ -1368,7 +1687,7 @@ class AtomicNoRet <string noRetOp, bit isRet> { class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP1Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr <opName#"_e32", SISubtarget.NONE>, + SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>, MnemonicAlias<opName#"_e32", opName> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1379,14 +1698,18 @@ class VOP1_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : class VOP1_Real_si <string opName, vop1 op, dag outs, dag ins, string asm> : VOP1<op.SI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : VOP1<op.VI, outs, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, @@ -1399,6 +1722,49 @@ multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, } +class VOP1_DPP <vop1 op, string opName, VOPProfile p> : + VOP1_DPPe <op.VI>, + VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> { + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "DPP"; + let DisableDecoder = DisableVIDecoder; + let src0_modifiers = !if(p.HasModifiers, ?, 0); + let src1_modifiers = 0; +} + +class SDWADisableFields <VOPProfile p> { + bits<8> src0 = !if(!eq(p.NumSrcArgs, 0), 0, ?); + bits<3> src0_sel = !if(!eq(p.NumSrcArgs, 0), 6, ?); + bits<2> src0_fmodifiers = !if(!eq(p.NumSrcArgs, 0), + 0, + !if(p.HasModifiers, ?, 0)); + bits<1> src0_imodifiers = !if(!eq(p.NumSrcArgs, 0), + 0, + !if(p.HasModifiers, 0, ?)); + bits<3> src1_sel = !if(!eq(p.NumSrcArgs, 0), 6, + !if(!eq(p.NumSrcArgs, 1), 6, + ?)); + bits<2> src1_fmodifiers = !if(!eq(p.NumSrcArgs, 0), 0, + !if(!eq(p.NumSrcArgs, 1), 0, + !if(p.HasModifiers, ?, 0))); + bits<1> src1_imodifiers = !if(!eq(p.NumSrcArgs, 0), 0, + !if(!eq(p.NumSrcArgs, 1), 0, + !if(p.HasModifiers, 0, ?))); + bits<3> dst_sel = !if(p.HasDst, ?, 6); + bits<2> dst_unused = !if(p.HasDst, ?, 2); + bits<1> clamp = !if(!eq(p.NumSrcArgs, 0), 0, ?); +} + +class VOP1_SDWA <vop1 op, string opName, VOPProfile p> : + VOP1_SDWAe <op.VI>, + VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, + SDWADisableFields <p> { + let AsmMatchConverter = "cvtSdwaVOP1"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let 
DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, string asm = opName#p.Asm32> { @@ -1410,7 +1776,7 @@ multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : VOP2Common <outs, ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, + SIMCInstr<opName#"_e32", SIEncodingFamily.NONE>, MnemonicAlias<opName#"_e32", opName> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1418,14 +1784,18 @@ class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : class VOP2_Real_si <string opName, vop2 op, dag outs, dag ins, string asm> : VOP2 <op.SI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : VOP2 <op.VI, outs, ins, opName#asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, @@ -1449,6 +1819,26 @@ multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, } +class VOP2_DPP <vop2 op, string opName, VOPProfile p> : + VOP2_DPPe <op.VI>, + VOP_DPP <p.OutsDPP, p.InsDPP, opName#p.AsmDPP, [], p.HasModifiers> { + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "DPP"; + let DisableDecoder = DisableVIDecoder; + let src0_modifiers = !if(p.HasModifiers, ?, 0); + let src1_modifiers = !if(p.HasModifiers, ?, 0); +} + +class VOP2_SDWA <vop2 op, string opName, VOPProfile p> : + VOP2_SDWAe <op.VI>, + VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, + SDWADisableFields <p> { + let AsmMatchConverter = "cvtSdwaVOP2"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + class VOP3DisableFields <bit HasSrc1, bit HasSrc2, bit HasModifiers> { bits<2> src0_modifiers = !if(HasModifiers, ?, 0); @@ -1471,10 +1861,11 @@ class VOP3DisableModFields <bit HasSrc0Mods, bits<1> clamp = !if(HasOutputMods, ?, 0); } -class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : - VOP3Common <outs, ins, "", pattern>, +class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, "", pattern, HasMods, VOP3Only>, VOP <opName>, - SIMCInstr<opName#"_e64", SISubtarget.NONE>, + SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>, MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -1483,44 +1874,96 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : field bit src0; } -class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : - VOP3Common <outs, ins, asm, []>, +class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, VOP3e <op>, - SIMCInstr<opName#"_e64", SISubtarget.SI> { + SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = 
"SICI"; + let DisableDecoder = DisableSIDecoder; } -class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : - VOP3Common <outs, ins, asm, []>, +class VOP3_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, VOP3e_vi <op>, - SIMCInstr <opName#"_e64", SISubtarget.VI> { + SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +class VOP3_C_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, + VOP3ce <op>, + SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class VOP3_C_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, + VOP3ce_vi <op>, + SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } -class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : - VOP3Common <outs, ins, asm, []>, +class VOP3b_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, VOP3be <op>, - SIMCInstr<opName#"_e64", SISubtarget.SI> { + SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } -class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName> : - VOP3Common <outs, ins, asm, []>, +class VOP3b_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, VOP3be_vi <op>, - SIMCInstr <opName#"_e64", SISubtarget.VI> { + SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +class VOP3e_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, + VOP3e <op>, + SIMCInstr<opName#"_e64", SIEncodingFamily.SI> { + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class VOP3e_Real_vi <bits<10> op, dag outs, dag ins, string asm, string opName, + bit HasMods = 0, bit VOP3Only = 0> : + VOP3Common <outs, ins, asm, [], HasMods, VOP3Only>, + VOP3e_vi <op>, + SIMCInstr <opName#"_e64", SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { + string opName, int NumSrcArgs, bit HasMods = 1, bit VOP3Only = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), !if(!eq(NumSrcArgs, 2), 0, 1), HasMods>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + def _vi : 
VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, VOP3DisableFields<!if(!eq(NumSrcArgs, 1), 0, 1), !if(!eq(NumSrcArgs, 2), 0, 1), HasMods>; @@ -1529,21 +1972,21 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>; - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<0, 0, HasMods>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<0, 0, HasMods>; } multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>; - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<0, 0, HasMods>; // No VI instruction. This class is for SI only. } @@ -1552,13 +1995,13 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, bit HasMods = 1> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<1, 0, HasMods>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<1, 0, HasMods>; } @@ -1566,10 +2009,10 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, bit HasMods = 1> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<1, 0, HasMods>; // No VI instruction. This class is for SI only. @@ -1579,13 +2022,26 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // instead of an implicit VCC as in the VOP2b format. multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit useSrc2Input = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; + bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>; + + def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, + VOP3DisableFields<1, useSrc2Input, HasMods>; + + def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, + VOP3DisableFields<1, useSrc2Input, HasMods>; +} - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, +// Same as VOP3b_2_3_m but no 2nd destination (sdst), e.g. v_cndmask_b32. 
+multiclass VOP3e_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0, bit VOP3Only = 0> { + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods, VOP3Only>; + + def _si : VOP3e_Real_si <op.SI3, outs, ins, asm, opName, HasMods, VOP3Only>, VOP3DisableFields<1, useSrc2Input, HasMods>; - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, + def _vi : VOP3e_Real_vi <op.VI3, outs, ins, asm, opName, HasMods, VOP3Only>, VOP3DisableFields<1, useSrc2Input, HasMods>; } @@ -1594,19 +2050,19 @@ multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, bit HasMods, bit defExec, string revOp, list<SchedReadWrite> sched> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, + def "" : VOP3_Pseudo <outs, ins, pattern, opName, HasMods>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; } - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, + def _si : VOP3_C_Real_si <op.SI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; } - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, + def _vi : VOP3_C_Real_vi <op.VI3, outs, ins, asm, opName, HasMods>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); let SchedRW = sched; @@ -1618,19 +2074,23 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pattern = []> { let isPseudo = 1, isCodeGenOnly = 1 in { def "" : VOPAnyCommon <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE>; + SIMCInstr<opName, SIEncodingFamily.NONE>; } def _si : VOP2 <op.SI3{5-0}, outs, ins, asm, []>, - SIMCInstr <opName, SISubtarget.SI> { + SIMCInstr <opName, SIEncodingFamily.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } def _vi : VOP3Common <outs, ins, asm, []>, VOP3e_vi <op.VI3>, VOP3DisableFields <1, 0, 0>, - SIMCInstr <opName, SISubtarget.VI> { + SIMCInstr <opName, SIEncodingFamily.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } @@ -1641,15 +2101,19 @@ multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, p.HasModifiers>; + + def _dpp : VOP1_DPP <op, opName, p>; + + def _sdwa : VOP1_SDWA <op, opName, p>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, @@ -1659,9 +2123,9 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, - [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]), opName, P.HasModifiers>; } @@ -1672,6 +2136,10 @@ multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, defm 
_e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, revOp, p.HasModifiers>; + + def _dpp : VOP2_DPP <op, opName, p>; + + def _sdwa : VOP2_SDWA <op, opName, p>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, @@ -1679,11 +2147,11 @@ multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, string revOp = opName> : VOP2_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), revOp >; @@ -1695,14 +2163,41 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), opName, revOp, P.HasModifiers>; } +multiclass VOP2e_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { + + let SchedRW = [Write32Bit] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } + + defm _e64 : VOP3e_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } +} + +multiclass VOP2eInst <vop2 op, string opName, VOPProfile P, + SDPatternOperator node = null_frag, + string revOp = opName> : VOP2e_Helper < + op, opName, P, [], + !if(P.HasModifiers, + [(set P.DstVT:$vdst, + (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, + i1:$clamp, i32:$omod)), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + revOp, !eq(P.NumSrcArgs, 3) +>; + multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, list<dag> pat64, string revOp, bit useSGPRInput> { @@ -1722,11 +2217,11 @@ multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, string revOp = opName> : VOP2b_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), revOp, !eq(P.NumSrcArgs, 3) >; @@ -1746,31 +2241,35 @@ multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, : VOP2_VI3_Helper < op, opName, P, [], !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), revOp >; -multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { +multiclass VOP2MADK <vop2 op, string opName, VOPProfile P, list<dag> pattern = []> { - def "" : VOP2_Pseudo <VOP_MADK.Outs, VOP_MADK.Ins, pattern, opName>; + def "" : VOP2_Pseudo <P.Outs, P.Ins32, pattern, opName>; 
let isCodeGenOnly = 0 in { - def _si : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, - !strconcat(opName, VOP_MADK.Asm), []>, - SIMCInstr <opName#"_e32", SISubtarget.SI>, + def _si : VOP2Common <P.Outs, P.Ins32, + !strconcat(opName, P.Asm32), []>, + SIMCInstr <opName#"_e32", SIEncodingFamily.SI>, VOP2_MADKe <op.SI> { let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } - def _vi : VOP2Common <VOP_MADK.Outs, VOP_MADK.Ins, - !strconcat(opName, VOP_MADK.Asm), []>, - SIMCInstr <opName#"_e32", SISubtarget.VI>, + def _vi : VOP2Common <P.Outs, P.Ins32, + !strconcat(opName, P.Asm32), []>, + SIMCInstr <opName#"_e32", SIEncodingFamily.VI>, VOP2_MADKe <op.VI> { let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } // End isCodeGenOnly = 0 } @@ -1778,37 +2277,55 @@ let isCodeGenOnly = 0 in { class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE> { + SIMCInstr<opName#"_e32", SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } +class VOPC_SDWA <vopc op, string opName, bit DefExec, VOPProfile p> : + VOPC_SDWAe <op.VI>, + VOP_SDWA <p.OutsSDWA, p.InsSDWA, opName#p.AsmSDWA, [], p.HasModifiers>, + SDWADisableFields <p> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let AsmMatchConverter = "cvtSdwaVOPC"; + let AssemblerPredicates = !if(p.HasExt, [isVI], [DisableInst]); + let DecoderNamespace = "SDWA"; + let DisableDecoder = DisableVIDecoder; +} + multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, string opName, bit DefExec, VOPProfile p, list<SchedReadWrite> sched, string revOpName = "", string asm = opName#"_e32 "#op_asm, string alias_asm = opName#" "#op_asm> { - def "" : VOPC_Pseudo <ins, pattern, opName> { + def "" : VOPC_Pseudo <ins, pattern, opName>, + VOP2_REV<revOpName#"_e32", !eq(revOpName, opName)> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); let SchedRW = sched; + let isConvergent = DefExec; } let AssemblerPredicates = [isSICI] in { def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.SI> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; + let isConvergent = DefExec; let SchedRW = sched; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; } } // End AssemblerPredicates = [isSICI] let AssemblerPredicates = [isVI] in { def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { + SIMCInstr <opName#"_e32", SIEncodingFamily.VI> { let Defs = !if(DefExec, [VCC, EXEC], [VCC]); - let hasSideEffects = DefExec; + let isConvergent = DefExec; let SchedRW = sched; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; } } // End AssemblerPredicates = [isVI] @@ -1819,10 +2336,13 @@ multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, list<dag> pat64, bit DefExec, string revOp, VOPProfile p, list<SchedReadWrite> sched> { - defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched, + revOp>; - defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64, opName, p.HasModifiers, DefExec, revOp, 
sched>; + + def _sdwa : VOPC_SDWA <op, opName, DefExec, p>; } // Special case for class instructions which only have modifiers on @@ -1832,9 +2352,14 @@ multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, VOPProfile p, list<SchedReadWrite> sched> { defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$sdst), p.Ins64, opName#p.Asm64, pat64, opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; + + def _sdwa : VOPC_SDWA <op, opName, DefExec, p> { + let src1_fmodifiers = 0; + let src1_imodifiers = ?; + } } multiclass VOPCInst <vopc op, string opName, @@ -1845,12 +2370,12 @@ multiclass VOPCInst <vopc op, string opName, VOPC_Helper < op, opName, [], !if(P.HasModifiers, - [(set i1:$dst, + [(set i1:$sdst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], - [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), DefExec, revOp, P, sched >; @@ -1859,9 +2384,9 @@ multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, list<SchedReadWrite> sched> : VOPC_Class_Helper < op, opName, [], !if(P.HasModifiers, - [(set i1:$dst, + [(set i1:$sdst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], - [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), + [(set i1:$sdst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), DefExec, opName, P, sched >; @@ -1897,10 +2422,6 @@ multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; -multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, - list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < - op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods ->; multiclass VOPC_CLASS_F32 <vopc op, string opName> : VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; @@ -1914,32 +2435,40 @@ multiclass VOPC_CLASS_F64 <vopc op, string opName> : multiclass VOPCX_CLASS_F64 <vopc op, string opName> : VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; + +multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, + list<dag> pat, int NumSrcArgs, bit HasMods, + bit VOP3Only = 0> : VOP3_m < + op, outs, ins, opName#" "#asm, pat, opName, NumSrcArgs, HasMods, VOP3Only +>; + multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, - SDPatternOperator node = null_frag> : VOP3_Helper < - op, opName, (outs P.DstRC.RegClass:$dst), P.Ins64, P.Asm64, + SDPatternOperator node = null_frag, bit VOP3Only = 0> : + VOP3_Helper < + op, opName, (outs P.DstRC.RegClass:$vdst), P.Ins64, P.Asm64, !if(!eq(P.NumSrcArgs, 3), !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1, + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))]), !if(!eq(P.NumSrcArgs, 2), !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT 
(VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) + [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]) /* P.NumSrcArgs == 1 */, !if(P.HasModifiers, - [(set P.DstVT:$dst, + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]))), - P.NumSrcArgs, P.HasModifiers + [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]))), + P.NumSrcArgs, P.HasModifiers, VOP3Only >; // Special case for v_div_fmas_{f32|f64}, since it seems to be the @@ -1948,14 +2477,14 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < op, opName, - (outs P.DstRC.RegClass:$dst), - (ins InputModsNoDefault:$src0_modifiers, P.Src0RC64:$src0, - InputModsNoDefault:$src1_modifiers, P.Src1RC64:$src1, - InputModsNoDefault:$src2_modifiers, P.Src2RC64:$src2, - ClampMod:$clamp, + (outs P.DstRC.RegClass:$vdst), + (ins FPInputMods:$src0_modifiers, P.Src0RC64:$src0, + FPInputMods:$src1_modifiers, P.Src1RC64:$src1, + FPInputMods:$src2_modifiers, P.Src2RC64:$src2, + clampmod:$clamp, omod:$omod), - "$dst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", - [(set P.DstVT:$dst, + "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", + [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1964,11 +2493,11 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = [], bit VOP3Only = 0> : VOP3b_2_3_m < op, P.Outs64, P.Ins64, opName#" "#P.Asm64, pattern, - opName, "", 1, 1 + opName, "", 1, 1, VOP3Only >; class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< @@ -1987,7 +2516,7 @@ class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< class VINTRP_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : VINTRPCommon <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -1996,13 +2525,21 @@ class VINTRP_Real_si <bits <2> op, string opName, dag outs, dag ins, string asm> : VINTRPCommon <outs, ins, asm, []>, VINTRPe <op>, - SIMCInstr<opName, SISubtarget.SI>; + SIMCInstr<opName, SIEncodingFamily.SI> { + let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} class VINTRP_Real_vi <bits <2> op, string opName, dag outs, dag ins, string asm> : VINTRPCommon <outs, ins, asm, []>, VINTRPe_vi <op>, - SIMCInstr<opName, SISubtarget.VI>; + SIMCInstr<opName, SIEncodingFamily.VI> { + let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern = []> { @@ -2019,7 +2556,7 @@ multiclass VINTRP_m <bits <2> op, dag outs, dag ins, string asm, class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : DS <outs, ins, "", pattern>, - SIMCInstr <opName, SISubtarget.NONE> { + SIMCInstr <opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ 
-2027,14 +2564,22 @@ class DS_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : class DS_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : DS <outs, ins, asm, []>, DSe <op>, - SIMCInstr <opName, SISubtarget.SI> { + SIMCInstr <opName, SIEncodingFamily.SI> { let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; } class DS_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : DS <outs, ins, asm, []>, DSe_vi <op>, - SIMCInstr <opName, SISubtarget.VI>; + SIMCInstr <opName, SIEncodingFamily.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; +} class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm> : DS_Real_si <op,opName, outs, ins, asm> { @@ -2043,7 +2588,6 @@ class DS_Off16_Real_si <bits<8> op, string opName, dag outs, dag ins, string asm bits<16> offset; let offset0 = offset{7-0}; let offset1 = offset{15-8}; - let isCodeGenOnly = 0; } class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm> : @@ -2055,9 +2599,24 @@ class DS_Off16_Real_vi <bits<8> op, string opName, dag outs, dag ins, string asm let offset1 = offset{15-8}; } +multiclass DS_1A_RET_ <dsop op, string opName, RegisterClass rc, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), + string asm = opName#" $vdst, $addr"#"$offset$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0 in { + def _si : DS_Off16_Real_si <op.SI, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op.VI, opName, outs, ins, asm>; + } +} + +// TODO: DS_1A_RET can be inherited from DS_1A_RET_ but its not working +// for some reason. 
In fact we can remove this class if use dsop everywhere multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr"#"$offset$gds"> { def "" : DS_Pseudo <opName, outs, ins, []>; @@ -2070,8 +2629,8 @@ multiclass DS_1A_RET <bits<8> op, string opName, RegisterClass rc, multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset0:$offset0, ds_offset1:$offset1, - gds01:$gds), + dag ins = (ins VGPR_32:$addr, offset0:$offset0, offset1:$offset1, + gds:$gds), string asm = opName#" $vdst, $addr"#"$offset0"#"$offset1$gds"> { def "" : DS_Pseudo <opName, outs, ins, []>; @@ -2084,7 +2643,7 @@ multiclass DS_1A_Off8_RET <bits<8> op, string opName, RegisterClass rc, multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc, dag outs = (outs), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), string asm = opName#" $addr, $data0"#"$offset$gds"> { def "" : DS_Pseudo <opName, outs, ins, []>, @@ -2096,11 +2655,25 @@ multiclass DS_1A1D_NORET <bits<8> op, string opName, RegisterClass rc, } } -multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, +multiclass DS_1A_Off8_NORET <bits<8> op, string opName, + dag outs = (outs), + dag ins = (ins VGPR_32:$addr, + offset0:$offset0, offset1:$offset1, gds:$gds), + string asm = opName#" $addr $offset0"#"$offset1$gds"> { + + def "" : DS_Pseudo <opName, outs, ins, []>; + + let data0 = 0, data1 = 0, vdst = 0, AsmMatchConverter = "cvtDSOffset01" in { + def _si : DS_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Real_vi <op, opName, outs, ins, asm>; + } +} + +multiclass DS_1A2D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, dag outs = (outs), dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset0:$offset0, ds_offset1:$offset1, gds01:$gds), - string asm = opName#" $addr, $data0, $data1"#"$offset0"#"$offset1"#"$gds"> { + offset0:$offset0, offset1:$offset1, gds:$gds), + string asm = opName#" $addr, $data0, $data1$offset0$offset1$gds"> { def "" : DS_Pseudo <opName, outs, ins, []>; @@ -2113,7 +2686,7 @@ multiclass DS_1A1D_Off8_NORET <bits<8> op, string opName, RegisterClass rc, multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, string noRetOp = "", dag outs = (outs rc:$vdst), - dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { let hasPostISelHook = 1 in { @@ -2127,6 +2700,23 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, } } +multiclass DS_1A1D_PERMUTE <bits<8> op, string opName, RegisterClass rc, + SDPatternOperator node = null_frag, + dag outs = (outs rc:$vdst), + dag ins = (ins VGPR_32:$addr, rc:$data0, offset:$offset), + string asm = opName#" $vdst, $addr, $data0"#"$offset"> { + + let mayLoad = 0, mayStore = 0, isConvergent = 1 in { + def "" : DS_Pseudo <opName, outs, ins, + [(set i32:$vdst, + (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))]>; + + let data1 = 0, gds = 0 in { + def "_vi" : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } + } +} + multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, string noRetOp = "", dag ins, dag outs = 
(outs rc:$vdst), @@ -2145,14 +2735,14 @@ multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, string noRetOp = "", RegisterClass src = rc> : DS_1A2D_RET_m <op, asm, rc, noRetOp, (ins VGPR_32:$addr, src:$data0, src:$data1, - ds_offset:$offset, gds:$gds) + offset:$offset, gds:$gds) >; multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc, string noRetOp = opName, dag outs = (outs), dag ins = (ins VGPR_32:$addr, rc:$data0, rc:$data1, - ds_offset:$offset, gds:$gds), + offset:$offset, gds:$gds), string asm = opName#" $addr, $data0, $data1"#"$offset"#"$gds"> { def "" : DS_Pseudo <opName, outs, ins, []>, @@ -2166,7 +2756,7 @@ multiclass DS_1A2D_NORET <bits<8> op, string opName, RegisterClass rc, multiclass DS_0A_RET <bits<8> op, string opName, dag outs = (outs VGPR_32:$vdst), - dag ins = (ins ds_offset:$offset, gds:$gds), + dag ins = (ins offset:$offset, gds:$gds), string asm = opName#" $vdst"#"$offset"#"$gds"> { let mayLoad = 1, mayStore = 1 in { @@ -2181,7 +2771,7 @@ multiclass DS_0A_RET <bits<8> op, string opName, multiclass DS_1A_RET_GDS <bits<8> op, string opName, dag outs = (outs VGPR_32:$vdst), - dag ins = (ins VGPR_32:$addr, ds_offset_gds:$offset), + dag ins = (ins VGPR_32:$addr, offset:$offset), string asm = opName#" $vdst, $addr"#"$offset gds"> { def "" : DS_Pseudo <opName, outs, ins, []>; @@ -2207,7 +2797,7 @@ multiclass DS_1A_GDS <bits<8> op, string opName, multiclass DS_1A <bits<8> op, string opName, dag outs = (outs), - dag ins = (ins VGPR_32:$addr, ds_offset:$offset, gds:$gds), + dag ins = (ins VGPR_32:$addr, offset:$offset, gds:$gds), string asm = opName#" $addr"#"$offset"#"$gds"> { let mayLoad = 1, mayStore = 1 in { @@ -2226,7 +2816,7 @@ multiclass DS_1A <bits<8> op, string opName, class MTBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : MTBUF <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -2235,12 +2825,18 @@ class MTBUF_Real_si <bits<3> op, string opName, dag outs, dag ins, string asm> : MTBUF <outs, ins, asm, []>, MTBUFe <op>, - SIMCInstr<opName, SISubtarget.SI>; + SIMCInstr<opName, SIEncodingFamily.SI> { + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; +} class MTBUF_Real_vi <bits<4> op, string opName, dag outs, dag ins, string asm> : MTBUF <outs, ins, asm, []>, MTBUFe_vi <op>, - SIMCInstr <opName, SISubtarget.VI>; + SIMCInstr <opName, SIEncodingFamily.VI> { + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; +} multiclass MTBUF_m <bits<3> op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -2311,7 +2907,7 @@ class MUBUFAddr64Table <bit is_addr64, string suffix = ""> { class MUBUF_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : MUBUF <outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; @@ -2329,16 +2925,22 @@ class MUBUF_Real_si <mubuf op, string opName, dag outs, dag ins, string asm> : MUBUF <outs, ins, asm, []>, MUBUFe <op.SI>, - SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let lds = 0; + let AssemblerPredicate = SIAssemblerPredicate; + let DecoderNamespace="SICI"; + let DisableDecoder = DisableSIDecoder; } class MUBUF_Real_vi <mubuf op, string opName, dag outs, dag ins, string asm> : MUBUF <outs, ins, asm, []>, MUBUFe_vi <op.VI>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { 
let lds = 0; + let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; } multiclass MUBUF_m <mubuf op, string opName, dag outs, dag ins, string asm, @@ -2399,38 +3001,82 @@ multiclass MUBUFAtomicAddr64_m <mubuf op, string opName, dag outs, dag ins, // for VI appropriately. } +multiclass MUBUFAtomicOther_m <mubuf op, string opName, dag outs, dag ins, + string asm, list<dag> pattern, bit is_return> { + + def "" : MUBUF_Pseudo <opName, outs, ins, pattern>, + AtomicNoRet<opName, is_return>; + + let tfe = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, outs, ins, asm>; + } + + def _vi : MUBUF_Real_vi <op, opName, outs, ins, asm>; + } +} + multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, ValueType vt, SDPatternOperator atomic> { - let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in { + let mayStore = 1, mayLoad = 1, hasPostISelHook = 1, hasSideEffects = 1 in { // No return variants - let glc = 0 in { + let glc = 0, AsmMatchConverter = "cvtMubufAtomic" in { defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 + SCSrc_32:$soffset, offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset$slc", [], 0 >; defm _OFFSET : MUBUFAtomicOffset_m < op, name#"_offset", (outs), - (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, + (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0 + name#" $vdata, off, $srsrc, $soffset$offset$slc", [], 0 >; + + let offen = 1, idxen = 0 in { + defm _OFFEN : MUBUFAtomicOther_m < + op, name#"_offen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$slc", [], 0 + >; + } + + let offen = 0, idxen = 1 in { + defm _IDXEN : MUBUFAtomicOther_m < + op, name#"_idxen", (outs), + (ins rc:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$slc", [], 0 + >; + } + + let offen = 1, idxen = 1 in { + defm _BOTHEN : MUBUFAtomicOther_m < + op, name#"_bothen", (outs), + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$slc", + [], 0 + >; + } } // glc = 0 // Variant that return values let glc = 1, Constraints = "$vdata = $vdata_in", + AsmMatchConverter = "cvtMubufAtomicReturn", DisableEncoding = "$vdata_in" in { defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", + SCSrc_32:$soffset, offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset addr64$offset glc$slc", [(set vt:$vdata, (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2439,13 +3085,42 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_OFFSET : MUBUFAtomicOffset_m < op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, slc:$slc), - name#" 
$vdata, $srsrc, $soffset"#"$offset"#" glc$slc", + offset:$offset, slc:$slc), + name#" $vdata, off, $srsrc, $soffset$offset glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 >; + let offen = 1, idxen = 0 in { + defm _RTN_OFFEN : MUBUFAtomicOther_m < + op, name#"_rtn_offen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset glc$slc", + [], 1 + >; + } + + let offen = 0, idxen = 1 in { + defm _RTN_IDXEN : MUBUFAtomicOther_m < + op, name#"_rtn_idxen", (outs rc:$vdata), + (ins rc:$vdata_in, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset glc$slc", + [], 1 + >; + } + + let offen = 1, idxen = 1 in { + defm _RTN_BOTHEN : MUBUFAtomicOther_m < + op, name#"_rtn_bothen", (outs rc:$vdata), + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, + offset:$offset, slc:$slc), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset glc$slc", + [], 1 + >; + } } // glc = 1 } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 @@ -2461,8 +3136,8 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, let offen = 0, idxen = 0, vaddr = 0 in { defm _OFFSET : MUBUF_m <op, name#"_offset", (outs regClass:$vdata), (ins SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe", [(set load_vt:$vdata, (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe)))]>; @@ -2471,33 +3146,32 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, let offen = 1, idxen = 0 in { defm _OFFEN : MUBUF_m <op, name#"_offen", (outs regClass:$vdata), (ins VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, slc:$slc, + SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 1 in { defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs regClass:$vdata), (ins VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; } let offen = 1, idxen = 1 in { defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs regClass:$vdata), (ins VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs regClass:$vdata), (ins VReg_64:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, + SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"# - "$glc"#"$slc"#"$tfe", + name#" $vdata, $vaddr, 
$srsrc, $soffset addr64$offset$glc$slc$tfe", [(set load_vt:$vdata, (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, @@ -2509,18 +3183,11 @@ multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, ValueType store_vt = i32, SDPatternOperator st = null_frag> { let mayLoad = 0, mayStore = 1 in { - defm : MUBUF_m <op, name, (outs), - (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, offen:$offen, idxen:$idxen, glc:$glc, slc:$slc, - tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset"#"$offen"#"$idxen"#"$offset"# - "$glc"#"$slc"#"$tfe", []>; - let offen = 0, idxen = 0, vaddr = 0 in { defm _OFFSET : MUBUF_m <op, name#"_offset",(outs), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $srsrc, $soffset"#"$offset"#"$glc"#"$slc"#"$tfe", + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, off, $srsrc, $soffset$offset$glc$slc$tfe", [(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))]>; } // offen = 0, idxen = 0, vaddr = 0 @@ -2528,35 +3195,35 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, let offen = 1, idxen = 0 in { defm _OFFEN : MUBUF_m <op, name#"_offen", (outs), (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset offen"#"$offset"# - "$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset offen"# + "$offset$glc$slc$tfe", []>; } // end offen = 1, idxen = 0 let offen = 0, idxen = 1 in { defm _IDXEN : MUBUF_m <op, name#"_idxen", (outs), (ins vdataClass:$vdata, VGPR_32:$vaddr, SReg_128:$srsrc, - SCSrc_32:$soffset, mbuf_offset:$offset, glc:$glc, + SCSrc_32:$soffset, offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + name#" $vdata, $vaddr, $srsrc, $soffset idxen$offset$glc$slc$tfe", []>; } let offen = 1, idxen = 1 in { defm _BOTHEN : MUBUF_m <op, name#"_bothen", (outs), (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), - name#" $vdata, $vaddr, $srsrc, $soffset idxen offen"#"$offset"#"$glc"#"$slc"#"$tfe", []>; + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), + name#" $vdata, $vaddr, $srsrc, $soffset idxen offen$offset$glc$slc$tfe", []>; } let offen = 0, idxen = 0 in { defm _ADDR64 : MUBUFAddr64_m <op, name#"_addr64", (outs), (ins vdataClass:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, - mbuf_offset:$offset, glc:$glc, slc:$slc, + offset:$offset, glc:$glc, slc:$slc, tfe:$tfe), name#" $vdata, $vaddr, $srsrc, $soffset addr64"# - "$offset"#"$glc"#"$slc"#"$tfe", + "$offset$glc$slc$tfe", [(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, @@ -2593,21 +3260,24 @@ class flat <bits<7> ci, bits<7> vi = ci> { class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : FLAT <0, outs, ins, "", pattern>, - SIMCInstr<opName, SISubtarget.NONE> { + SIMCInstr<opName, SIEncodingFamily.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : FLAT <op, outs, ins, asm, []>, - 
SIMCInstr<opName, SISubtarget.SI> { + SIMCInstr<opName, SIEncodingFamily.SI> { let AssemblerPredicate = isCIOnly; + let DecoderNamespace="CI"; } class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : FLAT <op, outs, ins, asm, []>, - SIMCInstr<opName, SISubtarget.VI> { + SIMCInstr<opName, SIEncodingFamily.VI> { let AssemblerPredicate = VIAssemblerPredicate; + let DecoderNamespace="VI"; + let DisableDecoder = DisableVIDecoder; } multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, @@ -2623,8 +3293,8 @@ multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, multiclass FLAT_Load_Helper <flat op, string asm_name, RegisterClass regClass, dag outs = (outs regClass:$vdst), - dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + dag ins = (ins VReg_64:$addr, glc:$glc, slc:$slc, tfe:$tfe), + string asm = asm_name#" $vdst, $addr$glc$slc$tfe"> { let data = 0, mayLoad = 1 in { @@ -2639,9 +3309,9 @@ multiclass FLAT_Load_Helper <flat op, string asm_name, multiclass FLAT_Store_Helper <flat op, string asm_name, RegisterClass vdataClass, dag outs = (outs), - dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, - slc_flat:$slc, tfe_flat:$tfe), - string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + dag ins = (ins VReg_64:$addr, vdataClass:$data, glc:$glc, + slc:$slc, tfe:$tfe), + string asm = asm_name#" $addr, $data$glc$slc$tfe"> { let mayLoad = 0, mayStore = 1, vdst = 0 in { @@ -2654,32 +3324,36 @@ multiclass FLAT_Store_Helper <flat op, string asm_name, } multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + ValueType vt, SDPatternOperator atomic = null_frag, + ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - dag outs_noret = (outs), string asm_noret = asm_name#" $addr, $data"#"$slc"#"$tfe"> { let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { - def "" : FLAT_Pseudo <NAME, outs_noret, + def "" : FLAT_Pseudo <NAME, (outs), (ins VReg_64:$addr, data_rc:$data, - slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + slc:$slc, tfe:$tfe), []>, AtomicNoRet <NAME, 0>; - def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + def _ci : FLAT_Real_ci <op.CI, NAME, (outs), (ins VReg_64:$addr, data_rc:$data, - slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + slc:$slc, tfe:$tfe), asm_noret>; - def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + def _vi : FLAT_Real_vi <op.VI, NAME, (outs), (ins VReg_64:$addr, data_rc:$data, - slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + slc:$slc, tfe:$tfe), asm_noret>; } let glc = 1, hasPostISelHook = 1 in { - defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; + defm _RTN : FLAT_AtomicRet_m < + op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc:$slc, tfe:$tfe), + asm_name#" $vdst, $addr, $data glc$slc$tfe", + [(set vt:$vdst, + (atomic (FLATAtomic i64:$addr, i1:$slc, i1:$tfe), data_vt:$data))] + >; } } @@ -2688,27 +3362,39 @@ class MIMG_Mask <string op, int channels> { int Channels = channels; } +class mimg <bits<7> si, bits<7> vi = si> { + field bits<7> SI = si; + field bits<7> VI = vi; +} + +class MIMG_Helper <dag outs, dag ins, string asm, + string dns=""> : MIMG<outs, ins, asm,[]> { + let mayLoad = 1; + let mayStore = 0; + let hasPostISelHook = 1; + let DecoderNamespace = dns; + let isAsmParserOnly = !if(!eq(dns,""), 1, 0); + let 
AsmMatchConverter = "cvtMIMG"; +} + class MIMG_NoSampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc> : MIMG < - op, + RegisterClass addr_rc, + string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc", - []> { + (ins addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe<op> { let ssamp = 0; - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; } multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels> { - def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32>, + def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32, + !if(!eq(channels, 1), "AMDGPU", "")>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>, MIMG_Mask<asm#"_V2", channels>; @@ -2723,27 +3409,116 @@ multiclass MIMG_NoSampler <bits<7> op, string asm> { defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>; } +class MIMG_Store_Helper <bits<7> op, string asm, + RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + (outs), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + >, MIMGe<op> { + let ssamp = 0; + let mayLoad = 1; // TableGen requires this for matching with the intrinsics + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; +} + +multiclass MIMG_Store_Addr_Helper <bits<7> op, string asm, + RegisterClass data_rc, + int channels> { + def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32>, + MIMG_Mask<asm#"_V1", channels>; + def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>, + MIMG_Mask<asm#"_V2", channels>; + def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>, + MIMG_Mask<asm#"_V4", channels>; +} + +multiclass MIMG_Store <bits<7> op, string asm> { + defm _V1 : MIMG_Store_Addr_Helper <op, asm, VGPR_32, 1>; + defm _V2 : MIMG_Store_Addr_Helper <op, asm, VReg_64, 2>; + defm _V3 : MIMG_Store_Addr_Helper <op, asm, VReg_96, 3>; + defm _V4 : MIMG_Store_Addr_Helper <op, asm, VReg_128, 4>; +} + +class MIMG_Atomic_Helper <string asm, RegisterClass data_rc, + RegisterClass addr_rc> : MIMG_Helper < + (outs data_rc:$vdst), + (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdst, $vaddr, $srsrc$dmask$unorm$glc$slc$r128$tfe$lwe$da" + > { + let mayStore = 1; + let hasSideEffects = 1; + let hasPostISelHook = 0; + let Constraints = "$vdst = $vdata"; + let AsmMatchConverter = "cvtMIMGAtomic"; +} + +class MIMG_Atomic_Real_si<mimg op, string name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> : + MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + SIMCInstr<name, SIEncodingFamily.SI>, + MIMGe<op.SI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isSICI]; + let DecoderNamespace = "SICI"; + let DisableDecoder = DisableSIDecoder; +} + +class MIMG_Atomic_Real_vi<mimg op, string name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> : + MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + 
SIMCInstr<name, SIEncodingFamily.VI>, + MIMGe<op.VI> { + let isCodeGenOnly = 0; + let AssemblerPredicates = [isVI]; + let DecoderNamespace = "VI"; + let DisableDecoder = DisableVIDecoder; +} + +multiclass MIMG_Atomic_Helper_m <mimg op, string name, string asm, + RegisterClass data_rc, RegisterClass addr_rc> { + let isPseudo = 1, isCodeGenOnly = 1 in { + def "" : MIMG_Atomic_Helper<asm, data_rc, addr_rc>, + SIMCInstr<name, SIEncodingFamily.NONE>; + } + + let ssamp = 0 in { + def _si : MIMG_Atomic_Real_si<op, name, asm, data_rc, addr_rc>; + + def _vi : MIMG_Atomic_Real_vi<op, name, asm, data_rc, addr_rc>; + } +} + +multiclass MIMG_Atomic <mimg op, string asm, RegisterClass data_rc = VGPR_32> { + defm _V1 : MIMG_Atomic_Helper_m <op, asm # "_V1", asm, data_rc, VGPR_32>; + defm _V2 : MIMG_Atomic_Helper_m <op, asm # "_V2", asm, data_rc, VReg_64>; + defm _V4 : MIMG_Atomic_Helper_m <op, asm # "_V3", asm, data_rc, VReg_128>; +} + class MIMG_Sampler_Helper <bits<7> op, string asm, RegisterClass dst_rc, - RegisterClass src_rc, int wqm> : MIMG < - op, + RegisterClass src_rc, + int wqm, + string dns=""> : MIMG_Helper < (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { - let mayLoad = 1; - let mayStore = 0; - let hasPostISelHook = 1; + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + dns>, MIMGe<op> { let WQM = wqm; } multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, RegisterClass dst_rc, int channels, int wqm> { - def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm>, + def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VGPR_32, wqm, + !if(!eq(channels, 1), "AMDGPU", "")>, MIMG_Mask<asm#"_V1", channels>; def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64, wqm>, MIMG_Mask<asm#"_V2", channels>; @@ -2755,31 +3530,24 @@ multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm, MIMG_Mask<asm#"_V16", channels>; } -multiclass MIMG_Sampler <bits<7> op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 0>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 0>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 0>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 0>; +multiclass MIMG_Sampler <bits<7> op, string asm, int wqm=0> { + defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, wqm>; + defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, wqm>; + defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, wqm>; + defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, wqm>; } -multiclass MIMG_Sampler_WQM <bits<7> op, string asm> { - defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VGPR_32, 1, 1>; - defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2, 1>; - defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3, 1>; - defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4, 1>; -} +multiclass MIMG_Sampler_WQM <bits<7> op, string asm> : MIMG_Sampler<op, asm, 1>; class MIMG_Gather_Helper <bits<7> op, string asm, RegisterClass dst_rc, RegisterClass src_rc, int wqm> : MIMG < - op, (outs dst_rc:$vdata), - (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, - i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr, - 
SReg_256:$srsrc, SReg_128:$ssamp), - asm#" $vdata, $dmask, $unorm, $glc, $da, $r128," - #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp", - []> { + (ins src_rc:$vaddr, SReg_256:$srsrc, SReg_128:$ssamp, + dmask:$dmask, unorm:$unorm, glc:$glc, slc:$slc, + r128:$r128, tfe:$tfe, lwe:$lwe, da:$da), + asm#" $vdata, $vaddr, $srsrc, $ssamp$dmask$unorm$glc$slc$r128$tfe$lwe$da", + []>, MIMGe<op> { let mayLoad = 1; let mayStore = 0; @@ -2789,10 +3557,12 @@ class MIMG_Gather_Helper <bits<7> op, string asm, // 1=red, 2=green, 4=blue, 8=alpha. (e.g. 1 returns // (red,red,red,red) etc.) The ISA document doesn't mention // this. - // Therefore, disable all code which updates DMASK by setting these two: - let MIMG = 0; + // Therefore, disable all code which updates DMASK by setting this: + let Gather4 = 1; let hasPostISelHook = 0; let WQM = wqm; + + let isAsmParserOnly = 1; // TBD: fix it later } multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, @@ -2810,19 +3580,14 @@ multiclass MIMG_Gather_Src_Helper <bits<7> op, string asm, MIMG_Mask<asm#"_V16", channels>; } -multiclass MIMG_Gather <bits<7> op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 0>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 0>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 0>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 0>; +multiclass MIMG_Gather <bits<7> op, string asm, int wqm=0> { + defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, wqm>; + defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, wqm>; + defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, wqm>; + defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, wqm>; } -multiclass MIMG_Gather_WQM <bits<7> op, string asm> { - defm _V1 : MIMG_Gather_Src_Helper<op, asm, VGPR_32, 1, 1>; - defm _V2 : MIMG_Gather_Src_Helper<op, asm, VReg_64, 2, 1>; - defm _V3 : MIMG_Gather_Src_Helper<op, asm, VReg_96, 3, 1>; - defm _V4 : MIMG_Gather_Src_Helper<op, asm, VReg_128, 4, 1>; -} +multiclass MIMG_Gather_WQM <bits<7> op, string asm> : MIMG_Gather<op, asm, 1>; //===----------------------------------------------------------------------===// // Vector instruction mappings @@ -2894,8 +3659,9 @@ def getMCOpcodeGen : InstrMapping { let FilterClass = "SIMCInstr"; let RowFields = ["PseudoInstr"]; let ColFields = ["Subtarget"]; - let KeyCol = [!cast<string>(SISubtarget.NONE)]; - let ValueCols = [[!cast<string>(SISubtarget.SI)],[!cast<string>(SISubtarget.VI)]]; + let KeyCol = [!cast<string>(SIEncodingFamily.NONE)]; + let ValueCols = [[!cast<string>(SIEncodingFamily.SI)], + [!cast<string>(SIEncodingFamily.VI)]]; } def getAddr64Inst : InstrMapping { diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 89692ab71f4d..6427db87cd6f 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -18,35 +18,17 @@ int P20 = 1; } def INTERP : InterpSlots; -def InterpSlot : Operand<i32> { - let PrintMethod = "printInterpSlot"; -} - -def SendMsgImm : Operand<i32> { - let PrintMethod = "printSendMsg"; -} - def isGCN : Predicate<"Subtarget->getGeneration() " - ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, + ">= SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + "== SISubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureSouthernIslands">; def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 
32">;
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def WAIT_FLAG : InstFlag<"printWaitFlag"> {
- let ParserMatchClass = SWaitMatchClass;
-}
-
let SubtargetPredicate = isGCN in {
//===----------------------------------------------------------------------===//
@@ -59,17 +41,17 @@ defm EXP : EXP_m;
// SMRD Instructions
//===----------------------------------------------------------------------===//
-// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SGPR_32 register class does not include M0
+// We are using the SReg_32_XM0 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SReg_32_XM0 register class does not include M0
// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>;
+defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SReg_32_XM0>;
defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>;
defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>;
defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
- smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32
+ smrd<0x08>, "s_buffer_load_dword", SReg_128, SReg_32_XM0
>;
defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -88,7 +70,15 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
 smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512
>;
-//def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>;
+let mayStore = ? in {
+// FIXME: mayStore = ? is a workaround for tablegen bug for different
+// inferred mayStore flags for the instruction pattern vs. standalone
+// Pat. Each considers the other contradictory.
+ +defm S_MEMTIME : SMRD_Special <smrd<0x1e, 0x24>, "s_memtime", + (outs SReg_64:$sdst), ?, " $sdst", [(set i64:$sdst, (int_amdgcn_s_memtime))] +>; +} defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 0x20>, "s_dcache_inv", int_amdgcn_s_dcache_inv>; @@ -101,7 +91,7 @@ let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { defm S_MOV_B32 : SOP1_32 <sop1<0x03, 0x00>, "s_mov_b32", []>; defm S_MOV_B64 : SOP1_64 <sop1<0x04, 0x01>, "s_mov_b64", []>; - } // let isRematerializeable = 1 + } // End isRematerializeable = 1 let Uses = [SCC] in { defm S_CMOV_B32 : SOP1_32 <sop1<0x05, 0x02>, "s_cmov_b32", []>; @@ -111,11 +101,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { defm S_NOT_B32 : SOP1_32 <sop1<0x07, 0x04>, "s_not_b32", - [(set i32:$dst, (not i32:$src0))] + [(set i32:$sdst, (not i32:$src0))] >; defm S_NOT_B64 : SOP1_64 <sop1<0x08, 0x05>, "s_not_b64", - [(set i64:$dst, (not i64:$src0))] + [(set i64:$sdst, (not i64:$src0))] >; defm S_WQM_B32 : SOP1_32 <sop1<0x09, 0x06>, "s_wqm_b32", []>; defm S_WQM_B64 : SOP1_64 <sop1<0x0a, 0x07>, "s_wqm_b64", []>; @@ -123,7 +113,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (bitreverse i32:$src0))] + [(set i32:$sdst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -131,7 +121,7 @@ let Defs = [SCC] in { defm S_BCNT0_I32_B32 : SOP1_32 <sop1<0x0d, 0x0a>, "s_bcnt0_i32_b32", []>; defm S_BCNT0_I32_B64 : SOP1_32_64 <sop1<0x0e, 0x0b>, "s_bcnt0_i32_b64", []>; defm S_BCNT1_I32_B32 : SOP1_32 <sop1<0x0f, 0x0c>, "s_bcnt1_i32_b32", - [(set i32:$dst, (ctpop i32:$src0))] + [(set i32:$sdst, (ctpop i32:$src0))] >; defm S_BCNT1_I32_B64 : SOP1_32_64 <sop1<0x10, 0x0d>, "s_bcnt1_i32_b64", []>; } // End Defs = [SCC] @@ -139,34 +129,34 @@ let Defs = [SCC] in { defm S_FF0_I32_B32 : SOP1_32 <sop1<0x11, 0x0e>, "s_ff0_i32_b32", []>; defm S_FF0_I32_B64 : SOP1_32_64 <sop1<0x12, 0x0f>, "s_ff0_i32_b64", []>; defm S_FF1_I32_B32 : SOP1_32 <sop1<0x13, 0x10>, "s_ff1_i32_b32", - [(set i32:$dst, (cttz_zero_undef i32:$src0))] + [(set i32:$sdst, (cttz_zero_undef i32:$src0))] >; defm S_FF1_I32_B64 : SOP1_32_64 <sop1<0x14, 0x11>, "s_ff1_i32_b64", []>; defm S_FLBIT_I32_B32 : SOP1_32 <sop1<0x15, 0x12>, "s_flbit_i32_b32", - [(set i32:$dst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] >; defm S_FLBIT_I32_B64 : SOP1_32_64 <sop1<0x16, 0x13>, "s_flbit_i32_b64", []>; defm S_FLBIT_I32 : SOP1_32 <sop1<0x17, 0x14>, "s_flbit_i32", - [(set i32:$dst, (int_AMDGPU_flbit_i32 i32:$src0))] + [(set i32:$sdst, (int_AMDGPU_flbit_i32 i32:$src0))] >; defm S_FLBIT_I32_I64 : SOP1_32_64 <sop1<0x18, 0x15>, "s_flbit_i32_i64", []>; defm S_SEXT_I32_I8 : SOP1_32 <sop1<0x19, 0x16>, "s_sext_i32_i8", - [(set i32:$dst, (sext_inreg i32:$src0, i8))] + [(set i32:$sdst, (sext_inreg i32:$src0, i8))] >; defm S_SEXT_I32_I16 : SOP1_32 <sop1<0x1a, 0x17>, "s_sext_i32_i16", - [(set i32:$dst, (sext_inreg i32:$src0, i16))] + [(set i32:$sdst, (sext_inreg i32:$src0, i16))] >; defm S_BITSET0_B32 : SOP1_32 <sop1<0x1b, 0x18>, "s_bitset0_b32", []>; -defm S_BITSET0_B64 : SOP1_64 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; +defm S_BITSET0_B64 : SOP1_64_32 <sop1<0x1c, 0x19>, "s_bitset0_b64", []>; defm S_BITSET1_B32 : SOP1_32 <sop1<0x1d, 0x1a>, "s_bitset1_b32", []>; -defm S_BITSET1_B64 : SOP1_64 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; +defm S_BITSET1_B64 : SOP1_64_32 <sop1<0x1e, 0x1b>, "s_bitset1_b64", []>; defm S_GETPC_B64 : SOP1_64_0 <sop1<0x1f, 0x1c>, "s_getpc_b64", []>; -defm S_SETPC_B64 : SOP1_64 
<sop1<0x20, 0x1d>, "s_setpc_b64", []>; +defm S_SETPC_B64 : SOP1_1 <sop1<0x20, 0x1d>, "s_setpc_b64", []>; defm S_SWAPPC_B64 : SOP1_64 <sop1<0x21, 0x1e>, "s_swappc_b64", []>; -defm S_RFE_B64 : SOP1_64 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; +defm S_RFE_B64 : SOP1_1 <sop1<0x22, 0x1f>, "s_rfe_b64", []>; let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in { @@ -206,36 +196,36 @@ let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { defm S_ADD_U32 : SOP2_32 <sop2<0x00>, "s_add_u32", []>; defm S_ADD_I32 : SOP2_32 <sop2<0x02>, "s_add_i32", - [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))] + [(set i32:$sdst, (add SSrc_32:$src0, SSrc_32:$src1))] >; } // End isCommutable = 1 defm S_SUB_U32 : SOP2_32 <sop2<0x01>, "s_sub_u32", []>; defm S_SUB_I32 : SOP2_32 <sop2<0x03>, "s_sub_i32", - [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))] + [(set i32:$sdst, (sub SSrc_32:$src0, SSrc_32:$src1))] >; let Uses = [SCC] in { // Carry in comes from SCC let isCommutable = 1 in { defm S_ADDC_U32 : SOP2_32 <sop2<0x04>, "s_addc_u32", - [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; + [(set i32:$sdst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End isCommutable = 1 defm S_SUBB_U32 : SOP2_32 <sop2<0x05>, "s_subb_u32", - [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; + [(set i32:$sdst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>; } // End Uses = [SCC] defm S_MIN_I32 : SOP2_32 <sop2<0x06>, "s_min_i32", - [(set i32:$dst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (smin i32:$src0, i32:$src1))] >; defm S_MIN_U32 : SOP2_32 <sop2<0x07>, "s_min_u32", - [(set i32:$dst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (umin i32:$src0, i32:$src1))] >; defm S_MAX_I32 : SOP2_32 <sop2<0x08>, "s_max_i32", - [(set i32:$dst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (smax i32:$src0, i32:$src1))] >; defm S_MAX_U32 : SOP2_32 <sop2<0x09>, "s_max_u32", - [(set i32:$dst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (umax i32:$src0, i32:$src1))] >; } // End Defs = [SCC] @@ -247,27 +237,27 @@ let Uses = [SCC] in { let Defs = [SCC] in { defm S_AND_B32 : SOP2_32 <sop2<0x0e, 0x0c>, "s_and_b32", - [(set i32:$dst, (and i32:$src0, i32:$src1))] + [(set i32:$sdst, (and i32:$src0, i32:$src1))] >; defm S_AND_B64 : SOP2_64 <sop2<0x0f, 0x0d>, "s_and_b64", - [(set i64:$dst, (and i64:$src0, i64:$src1))] + [(set i64:$sdst, (and i64:$src0, i64:$src1))] >; defm S_OR_B32 : SOP2_32 <sop2<0x10, 0x0e>, "s_or_b32", - [(set i32:$dst, (or i32:$src0, i32:$src1))] + [(set i32:$sdst, (or i32:$src0, i32:$src1))] >; defm S_OR_B64 : SOP2_64 <sop2<0x11, 0x0f>, "s_or_b64", - [(set i64:$dst, (or i64:$src0, i64:$src1))] + [(set i64:$sdst, (or i64:$src0, i64:$src1))] >; defm S_XOR_B32 : SOP2_32 <sop2<0x12, 0x10>, "s_xor_b32", - [(set i32:$dst, (xor i32:$src0, i32:$src1))] + [(set i32:$sdst, (xor i32:$src0, i32:$src1))] >; defm S_XOR_B64 : SOP2_64 <sop2<0x13, 0x11>, "s_xor_b64", - [(set i64:$dst, (xor i64:$src0, i64:$src1))] + [(set i64:$sdst, (xor i64:$src0, i64:$src1))] >; defm S_ANDN2_B32 : SOP2_32 <sop2<0x14, 0x12>, "s_andn2_b32", []>; defm S_ANDN2_B64 : SOP2_64 <sop2<0x15, 0x13>, "s_andn2_b64", []>; @@ -286,30 +276,30 @@ let AddedComplexity = 1 in { let Defs = [SCC] in { defm S_LSHL_B32 : SOP2_32 <sop2<0x1e, 0x1c>, "s_lshl_b32", - [(set i32:$dst, (shl i32:$src0, i32:$src1))] + [(set i32:$sdst, (shl i32:$src0, i32:$src1))] >; defm S_LSHL_B64 : SOP2_64_32 <sop2<0x1f, 0x1d>, "s_lshl_b64", - [(set i64:$dst, (shl i64:$src0, i32:$src1))] + [(set i64:$sdst, (shl i64:$src0, 
i32:$src1))] >; defm S_LSHR_B32 : SOP2_32 <sop2<0x20, 0x1e>, "s_lshr_b32", - [(set i32:$dst, (srl i32:$src0, i32:$src1))] + [(set i32:$sdst, (srl i32:$src0, i32:$src1))] >; defm S_LSHR_B64 : SOP2_64_32 <sop2<0x21, 0x1f>, "s_lshr_b64", - [(set i64:$dst, (srl i64:$src0, i32:$src1))] + [(set i64:$sdst, (srl i64:$src0, i32:$src1))] >; defm S_ASHR_I32 : SOP2_32 <sop2<0x22, 0x20>, "s_ashr_i32", - [(set i32:$dst, (sra i32:$src0, i32:$src1))] + [(set i32:$sdst, (sra i32:$src0, i32:$src1))] >; defm S_ASHR_I64 : SOP2_64_32 <sop2<0x23, 0x21>, "s_ashr_i64", - [(set i64:$dst, (sra i64:$src0, i32:$src1))] + [(set i64:$sdst, (sra i64:$src0, i32:$src1))] >; } // End Defs = [SCC] defm S_BFM_B32 : SOP2_32 <sop2<0x24, 0x22>, "s_bfm_b32", - [(set i32:$dst, (AMDGPUbfm i32:$src0, i32:$src1))]>; -defm S_BFM_B64 : SOP2_64 <sop2<0x25, 0x23>, "s_bfm_b64", []>; + [(set i32:$sdst, (AMDGPUbfm i32:$src0, i32:$src1))]>; +defm S_BFM_B64 : SOP2_64_32_32 <sop2<0x25, 0x23>, "s_bfm_b64", []>; defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", - [(set i32:$dst, (mul i32:$src0, i32:$src1))] + [(set i32:$sdst, (mul i32:$src0, i32:$src1))] >; } // End AddedComplexity = 1 @@ -317,7 +307,7 @@ defm S_MUL_I32 : SOP2_32 <sop2<0x26, 0x24>, "s_mul_i32", let Defs = [SCC] in { defm S_BFE_U32 : SOP2_32 <sop2<0x27, 0x25>, "s_bfe_u32", []>; defm S_BFE_I32 : SOP2_32 <sop2<0x28, 0x26>, "s_bfe_i32", []>; -defm S_BFE_U64 : SOP2_64 <sop2<0x29, 0x27>, "s_bfe_u64", []>; +defm S_BFE_U64 : SOP2_64_32 <sop2<0x29, 0x27>, "s_bfe_u64", []>; defm S_BFE_I64 : SOP2_64_32 <sop2<0x2a, 0x28>, "s_bfe_i64", []>; } // End Defs = [SCC] @@ -336,23 +326,23 @@ defm S_ABSDIFF_I32 : SOP2_32 <sop2<0x2c, 0x2a>, "s_absdiff_i32", []>; // SOPC Instructions //===----------------------------------------------------------------------===// -def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "s_cmp_eq_i32">; -def S_CMP_LG_I32 : SOPC_32 <0x00000001, "s_cmp_lg_i32">; -def S_CMP_GT_I32 : SOPC_32 <0x00000002, "s_cmp_gt_i32">; -def S_CMP_GE_I32 : SOPC_32 <0x00000003, "s_cmp_ge_i32">; -def S_CMP_LT_I32 : SOPC_32 <0x00000004, "s_cmp_lt_i32">; -def S_CMP_LE_I32 : SOPC_32 <0x00000005, "s_cmp_le_i32">; -def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "s_cmp_eq_u32">; -def S_CMP_LG_U32 : SOPC_32 <0x00000007, "s_cmp_lg_u32">; -def S_CMP_GT_U32 : SOPC_32 <0x00000008, "s_cmp_gt_u32">; -def S_CMP_GE_U32 : SOPC_32 <0x00000009, "s_cmp_ge_u32">; -def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "s_cmp_lt_u32">; -def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; -////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "s_bitcmp0_b32", []>; -////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "s_bitcmp1_b32", []>; -////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "s_bitcmp0_b64", []>; -////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "s_bitcmp1_b64", []>; -//def S_SETVSKIP : SOPC_ <0x00000010, "s_setvskip", []>; +def S_CMP_EQ_I32 : SOPC_CMP_32 <0x00000000, "s_cmp_eq_i32", COND_EQ>; +def S_CMP_LG_I32 : SOPC_CMP_32 <0x00000001, "s_cmp_lg_i32", COND_NE>; +def S_CMP_GT_I32 : SOPC_CMP_32 <0x00000002, "s_cmp_gt_i32", COND_SGT>; +def S_CMP_GE_I32 : SOPC_CMP_32 <0x00000003, "s_cmp_ge_i32", COND_SGE>; +def S_CMP_LT_I32 : SOPC_CMP_32 <0x00000004, "s_cmp_lt_i32", COND_SLT>; +def S_CMP_LE_I32 : SOPC_CMP_32 <0x00000005, "s_cmp_le_i32", COND_SLE>; +def S_CMP_EQ_U32 : SOPC_CMP_32 <0x00000006, "s_cmp_eq_u32", COND_EQ>; +def S_CMP_LG_U32 : SOPC_CMP_32 <0x00000007, "s_cmp_lg_u32", COND_NE >; +def S_CMP_GT_U32 : SOPC_CMP_32 <0x00000008, "s_cmp_gt_u32", COND_UGT>; +def S_CMP_GE_U32 : SOPC_CMP_32 <0x00000009, "s_cmp_ge_u32", COND_UGE>; 
+def S_CMP_LT_U32 : SOPC_CMP_32 <0x0000000a, "s_cmp_lt_u32", COND_ULT>;
+def S_CMP_LE_U32 : SOPC_CMP_32 <0x0000000b, "s_cmp_le_u32", COND_ULE>;
+def S_BITCMP0_B32 : SOPC_32 <0x0000000c, "s_bitcmp0_b32">;
+def S_BITCMP1_B32 : SOPC_32 <0x0000000d, "s_bitcmp1_b32">;
+def S_BITCMP0_B64 : SOPC_64_32 <0x0000000e, "s_bitcmp0_b64">;
+def S_BITCMP1_B64 : SOPC_64_32 <0x0000000f, "s_bitcmp1_b64">;
+def S_SETVSKIP : SOPC_32 <0x00000010, "s_setvskip">;
//===----------------------------------------------------------------------===//
// SOPK Instructions
@@ -408,16 +398,23 @@ defm S_CBRANCH_I_FORK : SOPK_m <
 sopk<0x11, 0x10>, "s_cbranch_i_fork", (outs),
 (ins SReg_64:$sdst, u16imm:$simm16), " $sdst, $simm16"
>;
-defm S_GETREG_B32 : SOPK_32 <sopk<0x12, 0x11>, "s_getreg_b32", []>;
+
+let mayLoad = 1 in {
+defm S_GETREG_B32 : SOPK_m <
+ sopk<0x12, 0x11>, "s_getreg_b32", (outs SReg_32:$sdst),
+ (ins hwreg:$simm16), " $sdst, $simm16"
+>;
+}
+
defm S_SETREG_B32 : SOPK_m <
 sopk<0x13, 0x12>, "s_setreg_b32", (outs),
- (ins SReg_32:$sdst, u16imm:$simm16), " $sdst, $simm16"
+ (ins SReg_32:$sdst, hwreg:$simm16), " $simm16, $sdst"
>;
// FIXME: Not on SI?
//defm S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32", []>;
defm S_SETREG_IMM32_B32 : SOPK_IMM32 <
 sopk<0x15, 0x14>, "s_setreg_imm32_b32", (outs),
- (ins i32imm:$imm, u16imm:$simm16), " $imm, $simm16"
+ (ins i32imm:$imm, hwreg:$simm16), " $simm16, $imm"
>;
//===----------------------------------------------------------------------===//
@@ -429,10 +426,11 @@ def S_NOP : SOPP <0x00000000, (ins i16imm:$simm16), "s_nop $simm16">;
let isTerminator = 1 in {
def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(IL_retflag)]> {
+ [(AMDGPUendpgm)]> {
 let simm16 = 0;
 let isBarrier = 1;
 let hasCtrlDep = 1;
+ let hasSideEffects = 1;
}
let isBranch = 1 in {
@@ -449,7 +447,8 @@ def S_CBRANCH_SCC0 : SOPP <
>;
def S_CBRANCH_SCC1 : SOPP <
 0x00000005, (ins sopp_brtarget:$simm16),
- "s_cbranch_scc1 $simm16"
+ "s_cbranch_scc1 $simm16",
+ [(si_uniform_br_scc SCC, bb:$simm16)]
>;
} // End Uses = [SCC]
@@ -481,7 +480,7 @@ def S_CBRANCH_EXECNZ : SOPP <
let hasSideEffects = 1 in {
def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
- [(int_AMDGPU_barrier_local)]
+ [(int_amdgcn_s_barrier)]
> {
 let SchedRW = [WriteBarrier];
 let simm16 = 0;
@@ -490,18 +489,31 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
 let isConvergent = 1;
}
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
-def S_SLEEP : SOPP <0x0000000e, (ins i16imm:$simm16), "s_sleep $simm16">;
-def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$sim16), "s_setprio $sim16">;
+
+// On SI the documentation says sleep for approximately 64 * low 2
+// bits, consistent with the reported maximum of 448. On VI the
+// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
+// maximum really 15 on VI?
+def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
+ "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ let hasSideEffects = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC, M0] in {
+ // FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16", [(AMDGPUsendmsg (i32 imm:$simm16))] >; } // End Uses = [EXEC, M0] -def S_SENDMSGHALT : SOPP <0x00000011, (ins i16imm:$simm16), "s_sendmsghalt $simm16">; +def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">; def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">; def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> { let simm16 = 0; @@ -770,8 +782,8 @@ defm DS_XOR_B32 : DS_1A1D_NORET <0xb, "ds_xor_b32", VGPR_32>; defm DS_MSKOR_B32 : DS_1A2D_NORET <0xc, "ds_mskor_b32", VGPR_32>; let mayLoad = 0 in { defm DS_WRITE_B32 : DS_1A1D_NORET <0xd, "ds_write_b32", VGPR_32>; -defm DS_WRITE2_B32 : DS_1A1D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; -defm DS_WRITE2ST64_B32 : DS_1A1D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; +defm DS_WRITE2_B32 : DS_1A2D_Off8_NORET <0xe, "ds_write2_b32", VGPR_32>; +defm DS_WRITE2ST64_B32 : DS_1A2D_Off8_NORET <0xf, "ds_write2st64_b32", VGPR_32>; } defm DS_CMPST_B32 : DS_1A2D_NORET <0x10, "ds_cmpst_b32", VGPR_32>; defm DS_CMPST_F32 : DS_1A2D_NORET <0x11, "ds_cmpst_f32", VGPR_32>; @@ -811,7 +823,11 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, "ds_max_f32">; -defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; + +let Uses = [EXEC], mayLoad =0, mayStore = 0, isConvergent = 1 in { +defm DS_SWIZZLE_B32 : DS_1A_RET_ <dsop<0x35, 0x3d>, "ds_swizzle_b32", VGPR_32>; +} + let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; defm DS_READ2_B32 : DS_1A_Off8_RET <0x37, "ds_read2_b32", VReg_64>; @@ -839,8 +855,8 @@ defm DS_XOR_B64 : DS_1A1D_NORET <0x4b, "ds_xor_b64", VReg_64>; defm DS_MSKOR_B64 : DS_1A2D_NORET <0x4c, "ds_mskor_b64", VReg_64>; let mayLoad = 0 in { defm DS_WRITE_B64 : DS_1A1D_NORET <0x4d, "ds_write_b64", VReg_64>; -defm DS_WRITE2_B64 : DS_1A1D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; -defm DS_WRITE2ST64_B64 : DS_1A1D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; +defm DS_WRITE2_B64 : DS_1A2D_Off8_NORET <0x4E, "ds_write2_b64", VReg_64>; +defm DS_WRITE2ST64_B64 : DS_1A2D_Off8_NORET <0x4f, "ds_write2st64_b64", VReg_64>; } defm DS_CMPST_B64 : DS_1A2D_NORET <0x50, "ds_cmpst_b64", VReg_64>; defm DS_CMPST_F64 : DS_1A2D_NORET <0x51, "ds_cmpst_f64", VReg_64>; @@ -886,7 +902,7 @@ defm DS_MAX_SRC2_U32 : DS_1A <0x88, "ds_max_src2_u32">; defm DS_AND_SRC2_B32 : DS_1A <0x89, "ds_and_src_b32">; defm DS_OR_SRC2_B32 : DS_1A <0x8a, "ds_or_src2_b32">; defm DS_XOR_SRC2_B32 : DS_1A <0x8b, "ds_xor_src2_b32">; -defm DS_WRITE_SRC2_B32 : DS_1A <0x8c, "ds_write_src2_b32">; +defm DS_WRITE_SRC2_B32 : DS_1A_Off8_NORET <0x8d, "ds_write_src2_b32">; defm DS_MIN_SRC2_F32 : DS_1A <0x92, "ds_min_src2_f32">; defm DS_MAX_SRC2_F32 : DS_1A <0x93, "ds_max_src2_f32">; @@ -903,7 +919,7 @@ defm DS_MAX_SRC2_U64 : DS_1A <0xc8, "ds_max_src2_u64">; defm DS_AND_SRC2_B64 : DS_1A <0xc9, "ds_and_src2_b64">; defm DS_OR_SRC2_B64 : DS_1A <0xca, "ds_or_src2_b64">; defm DS_XOR_SRC2_B64 : DS_1A <0xcb, "ds_xor_src2_b64">; -defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; +defm DS_WRITE_SRC2_B64 : DS_1A_Off8_NORET <0xcd, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; 
@@ -937,16 +953,16 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Store_Helper < mubuf<0x07>, "buffer_store_format_xyzw", VReg_128 >; defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper < - mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, az_extloadi8_global + mubuf<0x08, 0x10>, "buffer_load_ubyte", VGPR_32, i32, mubuf_az_extloadi8 >; defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper < - mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, sextloadi8_global + mubuf<0x09, 0x11>, "buffer_load_sbyte", VGPR_32, i32, mubuf_sextloadi8 >; defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper < - mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, az_extloadi16_global + mubuf<0x0a, 0x12>, "buffer_load_ushort", VGPR_32, i32, mubuf_az_extloadi16 >; defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < - mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global + mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, mubuf_sextloadi16 >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load @@ -981,7 +997,9 @@ defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper < defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic < mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global >; -//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", []>; +defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic < + mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag +>; defm BUFFER_ATOMIC_ADD : MUBUF_Atomic < mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global >; @@ -1010,30 +1028,61 @@ defm BUFFER_ATOMIC_OR : MUBUF_Atomic < defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global >; -//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>; -//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>; -//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI -//def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI -//def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI -//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", []>; -//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>; -//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <mubuf<0x52, 0x62>, "buffer_atomic_add_x2", []>; -//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", []>; -//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI -//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", []>; -//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", []>; -//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", []>; -//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", []>; -//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59, 0x68>, "buffer_atomic_and_x2", []>; -//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", []>; -//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", []>; -//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", []>; -//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", []>; +defm BUFFER_ATOMIC_INC : MUBUF_Atomic < + mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global +>; +defm 
BUFFER_ATOMIC_DEC : MUBUF_Atomic < + mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global +>; + +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_Atomic <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI +//def BUFFER_ATOMIC_FMIN : MUBUF_Atomic <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI +//def BUFFER_ATOMIC_FMAX : MUBUF_Atomic <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI +defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Atomic < + mubuf<0x50, 0x60>, "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global +>; +defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic < + mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag +>; +defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Atomic < + mubuf<0x52, 0x62>, "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global +>; +defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Atomic < + mubuf<0x53, 0x63>, "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global +>; +//defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Atomic <mubuf<0x54>, "buffer_atomic_rsub_x2", []>; // isn't on CI & VI +defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Atomic < + mubuf<0x55, 0x64>, "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global +>; +defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Atomic < + mubuf<0x56, 0x65>, "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global +>; +defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Atomic < + mubuf<0x57, 0x66>, "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global +>; +defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Atomic < + mubuf<0x58, 0x67>, "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global +>; +defm BUFFER_ATOMIC_AND_X2 : MUBUF_Atomic < + mubuf<0x59, 0x68>, "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global +>; +defm BUFFER_ATOMIC_OR_X2 : MUBUF_Atomic < + mubuf<0x5a, 0x69>, "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global +>; +defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Atomic < + mubuf<0x5b, 0x6a>, "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global +>; +defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic < + mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global +>; +defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic < + mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global +>; //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -let SubtargetPredicate = isSI in { +let SubtargetPredicate = isSI, DisableVIDecoder = 1 in { defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI } @@ -1062,28 +1111,28 @@ defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "image_load_mip">; //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"image_load_pck_sgn", 0x00000003>; //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"image_load_mip_pck", 0x00000004>; //def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"image_load_mip_pck_sgn", 0x00000005>; -//def IMAGE_STORE : MIMG_NoPattern_ <"image_store", 0x00000008>; -//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"image_store_mip", 0x00000009>; +defm IMAGE_STORE : MIMG_Store <0x00000008, "image_store">; +defm IMAGE_STORE_MIP : MIMG_Store <0x00000009, "image_store_mip">; //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"image_store_pck", 0x0000000a>; //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"image_store_mip_pck", 0x0000000b>; defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "image_get_resinfo">; -//def IMAGE_ATOMIC_SWAP : 
MIMG_NoPattern_ <"image_atomic_swap", 0x0000000f>; -//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"image_atomic_cmpswap", 0x00000010>; -//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"image_atomic_add", 0x00000011>; -//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"image_atomic_sub", 0x00000012>; -//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"image_atomic_smin", 0x00000014>; -//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"image_atomic_umin", 0x00000015>; -//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"image_atomic_smax", 0x00000016>; -//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"image_atomic_umax", 0x00000017>; -//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"image_atomic_and", 0x00000018>; -//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"image_atomic_or", 0x00000019>; -//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"image_atomic_xor", 0x0000001a>; -//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"image_atomic_inc", 0x0000001b>; -//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"image_atomic_dec", 0x0000001c>; -//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; +defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimg<0x0f, 0x10>, "image_atomic_swap">; +defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimg<0x10, 0x11>, "image_atomic_cmpswap", VReg_64>; +defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimg<0x11, 0x12>, "image_atomic_add">; +defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimg<0x12, 0x13>, "image_atomic_sub">; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"image_atomic_rsub", 0x00000013>; -- not on VI +defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimg<0x14>, "image_atomic_smin">; +defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimg<0x15>, "image_atomic_umin">; +defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimg<0x16>, "image_atomic_smax">; +defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimg<0x17>, "image_atomic_umax">; +defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimg<0x18>, "image_atomic_and">; +defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimg<0x19>, "image_atomic_or">; +defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimg<0x1a>, "image_atomic_xor">; +defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimg<0x1b>, "image_atomic_inc">; +defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"image_atomic_fcmpswap", 0x0000001d>; -- not on VI +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, "image_sample">; defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, "image_sample_cl">; defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "image_sample_d">; @@ -1171,10 +1220,12 @@ let Uses = [EXEC] in { def V_READFIRSTLANE_B32 : VOP1 < 0x00000002, (outs SReg_32:$vdst), - (ins VGPR_32:$src0), + (ins VS_32:$src0), "v_readfirstlane_b32 $vdst, $src0", [] ->; +> { + let isConvergent = 1; +} } @@ -1234,7 +1285,7 @@ defm V_CVT_F64_U32 : VOP1Inst <vop1<0x16>, "v_cvt_f64_u32", VOP_F64_I32, uint_to_fp >; -} // let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] defm V_FRACT_F32 : VOP1Inst <vop1<0x20, 0x1b>, "v_fract_f32", VOP_F32_F32, AMDGPUfract @@ -1270,7 +1321,7 @@ defm V_RSQ_F32 : VOP1Inst <vop1<0x2e, 0x24>, "v_rsq_f32", VOP_F32_F32, AMDGPUrsq >; -} //let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let 
SchedRW = [WriteDouble] in { @@ -1281,7 +1332,7 @@ defm V_RSQ_F64 : VOP1Inst <vop1<0x31, 0x26>, "v_rsq_f64", VOP_F64_F64, AMDGPUrsq >; -} // let SchedRW = [WriteDouble]; +} // End SchedRW = [WriteDouble]; defm V_SQRT_F32 : VOP1Inst <vop1<0x33, 0x27>, "v_sqrt_f32", VOP_F32_F32, fsqrt @@ -1312,34 +1363,34 @@ defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; defm V_FFBL_B32 : VOP1Inst <vop1<0x3a, 0x2e>, "v_ffbl_b32", VOP_I32_I32>; defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", - VOP_I32_F64 + VOP_I32_F64, int_amdgcn_frexp_exp >; let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", - VOP_F64_F64 + VOP_F64_F64, int_amdgcn_frexp_mant >; defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", - VOP_F64_F64 + VOP_F64_F64, AMDGPUfract >; } // End SchedRW = [WriteDoubleAdd] defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", - VOP_I32_F32 + VOP_I32_F32, int_amdgcn_frexp_exp >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", - VOP_F32_F32 + VOP_F32_F32, int_amdgcn_frexp_mant >; let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { -defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NO_EXT<VOP_NONE>>; } let Uses = [M0, EXEC] in { -defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; -defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; -defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_NO_EXT<VOP_I32_I32>>; +defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_NO_EXT<VOP_I32_I32>>; +defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>; } // End Uses = [M0, EXEC] // These instruction only exist on SI and CI @@ -1348,11 +1399,12 @@ let SubtargetPredicate = isSICI in { let SchedRW = [WriteQuarterRate32] in { defm V_MOV_FED_B32 : VOP1InstSI <vop1<0x9>, "v_mov_fed_b32", VOP_I32_I32>; -defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", VOP_F32_F32>; +defm V_LOG_CLAMP_F32 : VOP1InstSI <vop1<0x26>, "v_log_clamp_f32", + VOP_F32_F32, int_amdgcn_log_clamp>; defm V_RCP_CLAMP_F32 : VOP1InstSI <vop1<0x28>, "v_rcp_clamp_f32", VOP_F32_F32>; defm V_RCP_LEGACY_F32 : VOP1InstSI <vop1<0x29>, "v_rcp_legacy_f32", VOP_F32_F32>; defm V_RSQ_CLAMP_F32 : VOP1InstSI <vop1<0x2c>, "v_rsq_clamp_f32", - VOP_F32_F32, AMDGPUrsq_clamped + VOP_F32_F32, AMDGPUrsq_clamp >; defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy @@ -1364,7 +1416,7 @@ let SchedRW = [WriteDouble] in { defm V_RCP_CLAMP_F64 : VOP1InstSI <vop1<0x30>, "v_rcp_clamp_f64", VOP_F64_F64>; defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", - VOP_F64_F64, AMDGPUrsq_clamped + VOP_F64_F64, AMDGPUrsq_clamp >; } // End SchedRW = [WriteDouble] @@ -1394,11 +1446,11 @@ defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; } // End OtherPredicates = [has32BankLDS] -let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst" in { +let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 in { defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; -} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $dst" +} // End OtherPredicates = 
[has32BankLDS], Constraints = "@earlyclobber $dst", isAsmParserOnly=1 let DisableEncoding = "$src0", Constraints = "$src0 = $dst" in { @@ -1426,15 +1478,9 @@ defm V_INTERP_MOV_F32 : VINTRP_m < // VOP2 Instructions //===----------------------------------------------------------------------===// -multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; - - defm _e64 : VOP3_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, - name#!cast<string>(VOP_CNDMASK.Asm64), [], name, 3>; -} - -defm V_CNDMASK_B32 : V_CNDMASK<vop2<0x0>, "v_cndmask_b32">; +defm V_CNDMASK_B32 : VOP2eInst <vop2<0x0, 0x0>, "v_cndmask_b32", + VOP2e_I32_I32_I32_I1 +>; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <vop2<0x3, 0x1>, "v_add_f32", @@ -1450,7 +1496,7 @@ defm V_SUBREV_F32 : VOP2Inst <vop2<0x5, 0x3>, "v_subrev_f32", let isCommutable = 1 in { defm V_MUL_LEGACY_F32 : VOP2Inst <vop2<0x7, 0x4>, "v_mul_legacy_f32", - VOP_F32_F32_F32, int_AMDGPU_mul + VOP_F32_F32_F32 >; defm V_MUL_F32 : VOP2Inst <vop2<0x8, 0x5>, "v_mul_f32", @@ -1501,16 +1547,16 @@ defm V_AND_B32 : VOP2Inst <vop2<0x1b, 0x13>, "v_and_b32", VOP_I32_I32_I32>; defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; -let Constraints = "$dst = $src2", DisableEncoding="$src2", +let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>; } } // End isCommutable = 1 -defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; +defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32", VOP_MADMK>; let isCommutable = 1 in { -defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; +defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32", VOP_MADAK>; } // End isCommutable = 1 let isCommutable = 1 in { @@ -1540,11 +1586,14 @@ defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", } // End isCommutable = 1 +// These are special and do not read the exec mask. 
+let isConvergent = 1, Uses = []<Register> in { + defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, "v_readlane_b32", (outs SReg_32:$vdst), - (ins VGPR_32:$src0, SCSrc_32:$src1), + (ins VS_32:$src0, SCSrc_32:$src1), "v_readlane_b32 $vdst, $src0, $src1" >; @@ -1556,6 +1605,8 @@ defm V_WRITELANE_B32 : VOP2SI_3VI_m < "v_writelane_b32 $vdst, $src0, $src1" >; +} // End isConvergent = 1 + // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1636,16 +1687,16 @@ defm V_MAD_U32_U24 : VOP3Inst <vop3<0x143, 0x1c3>, "v_mad_u32_u24", } // End isCommutable = 1 defm V_CUBEID_F32 : VOP3Inst <vop3<0x144, 0x1c4>, "v_cubeid_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubeid >; defm V_CUBESC_F32 : VOP3Inst <vop3<0x145, 0x1c5>, "v_cubesc_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubesc >; defm V_CUBETC_F32 : VOP3Inst <vop3<0x146, 0x1c6>, "v_cubetc_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubetc >; defm V_CUBEMA_F32 : VOP3Inst <vop3<0x147, 0x1c7>, "v_cubema_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, int_amdgcn_cubema >; defm V_BFE_U32 : VOP3Inst <vop3<0x148, 0x1c8>, "v_bfe_u32", @@ -1666,6 +1717,10 @@ defm V_FMA_F32 : VOP3Inst <vop3<0x14b, 0x1cb>, "v_fma_f32", defm V_FMA_F64 : VOP3Inst <vop3<0x14c, 0x1cc>, "v_fma_f64", VOP_F64_F64_F64_F64, fma >; + +defm V_LERP_U8 : VOP3Inst <vop3<0x14d, 0x1cd>, "v_lerp_u8", + VOP_I32_I32_I32_I32, int_amdgcn_lerp +>; } // End isCommutable = 1 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "v_lerp_u8", []>; @@ -1695,13 +1750,13 @@ defm V_MAX3_U32 : VOP3Inst <vop3<0x156, 0x1d5>, "v_max3_u32", VOP_I32_I32_I32_I32, AMDGPUumax3 >; defm V_MED3_F32 : VOP3Inst <vop3<0x157, 0x1d6>, "v_med3_f32", - VOP_F32_F32_F32_F32 + VOP_F32_F32_F32_F32, AMDGPUfmed3 >; defm V_MED3_I32 : VOP3Inst <vop3<0x158, 0x1d7>, "v_med3_i32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUsmed3 >; defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", - VOP_I32_I32_I32_I32 + VOP_I32_I32_I32_I32, AMDGPUumed3 >; //def V_SAD_U8 : VOP3_U8 <0x0000015a, "v_sad_u8", []>; @@ -1710,7 +1765,7 @@ defm V_MED3_U32 : VOP3Inst <vop3<0x159, 0x1d8>, "v_med3_u32", defm V_SAD_U32 : VOP3Inst <vop3<0x15d, 0x1dc>, "v_sad_u32", VOP_I32_I32_I32_I32 >; -////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; +//def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "v_cvt_pk_u8_f32", []>; defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; @@ -1727,26 +1782,26 @@ let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", - VOP_F64_F64_F64, fadd + VOP_F64_F64_F64, fadd, 1 >; defm V_MUL_F64 : VOP3Inst <vop3<0x165, 0x281>, "v_mul_f64", - VOP_F64_F64_F64, fmul + VOP_F64_F64_F64, fmul, 1 >; defm V_MIN_F64 : VOP3Inst <vop3<0x166, 0x282>, "v_min_f64", - VOP_F64_F64_F64, fminnum + VOP_F64_F64_F64, fminnum, 1 >; defm V_MAX_F64 : VOP3Inst <vop3<0x167, 0x283>, "v_max_f64", - VOP_F64_F64_F64, fmaxnum + VOP_F64_F64_F64, fmaxnum, 1 >; -} // isCommutable = 1 +} // End isCommutable = 1 defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", - VOP_F64_F64_I32, AMDGPUldexp + VOP_F64_F64_I32, AMDGPUldexp, 1 >; -} // let SchedRW = [WriteDoubleAdd] +} // End let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1754,30 +1809,33 @@ defm V_MUL_LO_U32 : VOP3Inst <vop3<0x169, 0x285>, "v_mul_lo_u32", VOP_I32_I32_I32 >; defm V_MUL_HI_U32 : VOP3Inst <vop3<0x16a, 0x286>, "v_mul_hi_u32", 
- VOP_I32_I32_I32 + VOP_I32_I32_I32, mulhu >; +let DisableVIDecoder=1 in { // removed from VI as identical to V_MUL_LO_U32 defm V_MUL_LO_I32 : VOP3Inst <vop3<0x16b, 0x285>, "v_mul_lo_i32", VOP_I32_I32_I32 >; +} + defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, mulhs >; -} // isCommutable = 1, SchedRW = [WriteQuarterRate32] +} // End isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", - VOP3b_F32_I1_F32_F32_F32 + VOP3b_F32_I1_F32_F32_F32, [], 1 >; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", - VOP3b_F64_I1_F64_F64_F64 + VOP3b_F64_I1_F64_F64_F64, [], 1 >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] let isCommutable = 1, Uses = [VCC, EXEC] in { @@ -1814,7 +1872,7 @@ defm V_TRIG_PREOP_F64 : VOP3Inst < vop3<0x174, 0x292>, "v_trig_preop_f64", VOP_F64_F64_I32, AMDGPUtrig_preop >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] // These instructions only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1828,7 +1886,7 @@ defm V_MULLIT_F32 : VOP3Inst <vop3<0x150>, "v_mullit_f32", } // End SubtargetPredicate = isSICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = isVI, DisableSIDecoder = 1 in { defm V_LSHLREV_B64 : VOP3Inst <vop3<0, 0x28f>, "v_lshlrev_b64", VOP_I64_I32_I64 @@ -1845,113 +1903,145 @@ defm V_ASHRREV_I64 : VOP3Inst <vop3<0, 0x291>, "v_ashrrev_i64", //===----------------------------------------------------------------------===// // Pseudo Instructions //===----------------------------------------------------------------------===// -let isCodeGenOnly = 1, isPseudo = 1 in { + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // For use in patterns -def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), - (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] ->; +def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), + (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", []> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} -let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; -} // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 - -let hasSideEffects = 1, SALU = 1 in { -def SGPR_USE : InstSI <(outs),(ins), "", []>; +def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_64:$src0)> { + let VALU = 1; } +} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] + +let usesCustomInserter = 1, SALU = 1 in { +def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins), + [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; +} // End let usesCustomInserter = 1, SALU = 1 // SI pseudo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. -let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { -let Uses = [EXEC], Defs = [EXEC] in { +let hasSideEffects = 1 in { + +// Dummy terminator instruction to use after control flow instructions +// replaced with exec mask operations. 
+def SI_MASK_BRANCH : PseudoInstSI < + (outs), (ins brtarget:$target, SReg_64:$dst)> { + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let SALU = 1; +} + +let Uses = [EXEC], Defs = [EXEC, SCC] in { let isBranch = 1, isTerminator = 1 in { -def SI_IF: InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, brtarget:$target), - "", - [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))] ->; +def SI_IF: PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), + [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]> { + let Constraints = ""; +} -def SI_ELSE : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src, brtarget:$target), - "", - [(set i64:$dst, (int_SI_else i64:$src, bb:$target))] -> { +def SI_ELSE : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), + [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> { let Constraints = "$src = $dst"; } -def SI_LOOP : InstSI < - (outs), - (ins SReg_64:$saved, brtarget:$target), - "si_loop $saved, $target", - [(int_SI_loop i64:$saved, bb:$target)] +def SI_LOOP : PseudoInstSI < + (outs), (ins SReg_64:$saved, brtarget:$target), + [(int_amdgcn_loop i64:$saved, bb:$target)] >; -} // end isBranch = 1, isTerminator = 1 +} // End isBranch = 1, isTerminator = 1 -def SI_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src), - "si_else $dst, $src", - [(set i64:$dst, (int_SI_break i64:$src))] + +def SI_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src), + [(set i64:$dst, (int_amdgcn_break i64:$src))] >; -def SI_IF_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$vcc, SReg_64:$src), - "si_if_break $dst, $vcc, $src", - [(set i64:$dst, (int_SI_if_break i1:$vcc, i64:$src))] +def SI_IF_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), + [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))] >; -def SI_ELSE_BREAK : InstSI < - (outs SReg_64:$dst), - (ins SReg_64:$src0, SReg_64:$src1), - "si_else_break $dst, $src0, $src1", - [(set i64:$dst, (int_SI_else_break i64:$src0, i64:$src1))] +def SI_ELSE_BREAK : PseudoInstSI < + (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), + [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))] >; -def SI_END_CF : InstSI < - (outs), - (ins SReg_64:$saved), - "si_end_cf $saved", - [(int_SI_end_cf i64:$saved)] +def SI_END_CF : PseudoInstSI < + (outs), (ins SReg_64:$saved), + [(int_amdgcn_end_cf i64:$saved)] >; -} // End Uses = [EXEC], Defs = [EXEC] +} // End Uses = [EXEC], Defs = [EXEC, SCC] let Uses = [EXEC], Defs = [EXEC,VCC] in { -def SI_KILL : InstSI < - (outs), - (ins VSrc_32:$src), - "si_kill $src", - [(int_AMDGPU_kill f32:$src)] ->; +def SI_KILL : PseudoInstSI < + (outs), (ins VSrc_32:$src), + [(int_AMDGPU_kill f32:$src)]> { + let isConvergent = 1; + let usesCustomInserter = 1; +} + +def SI_KILL_TERMINATOR : PseudoInstSI < + (outs), (ins VSrc_32:$src)> { + let isTerminator = 1; +} + } // End Uses = [EXEC], Defs = [EXEC,VCC] -} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 +} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1 -let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { +def SI_PS_LIVE : PseudoInstSI < + (outs SReg_64:$dst), (ins), + [(set i1:$dst, (int_amdgcn_ps_live))]> { + let SALU = 1; +} -class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins rc:$src, VSrc_32:$idx, i32imm:$off), - "si_indirect_src $dst, $temp, $src, $idx, $off", - [] ->; +// Used as an isel pseudo to directly emit initialization with an +// s_mov_b32 rather than a copy of 
another initialized +// register. MachineCSE skips copies, and we don't want to have to +// fold operands before it runs. +def SI_INIT_M0 : PseudoInstSI <(outs), (ins SSrc_32:$src)> { + let Defs = [M0]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; + let SALU = 1; + let isReMaterializable = 1; +} -class SI_INDIRECT_DST<RegisterClass rc> : InstSI < - (outs rc:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val), - "si_indirect_dst $dst, $temp, $src, $idx, $off, $val", - [] -> { - let Constraints = "$src = $dst"; +def SI_RETURN : PseudoInstSI < + (outs), (ins variable_ops), [(AMDGPUreturn)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let hasSideEffects = 1; + let SALU = 1; + let hasNoSchedulingInfo = 1; +} + +let Uses = [EXEC], Defs = [EXEC, VCC, M0], + UseNamedOperandTable = 1 in { + +class SI_INDIRECT_SRC<RegisterClass rc> : PseudoInstSI < + (outs VGPR_32:$vdst, SReg_64:$sdst), + (ins rc:$src, VS_32:$idx, i32imm:$offset)>; + +class SI_INDIRECT_DST<RegisterClass rc> : PseudoInstSI < + (outs rc:$vdst, SReg_64:$sdst), + (ins unknown:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { + let Constraints = "$src = $vdst"; } // TODO: We can support indirect SGPR access. @@ -1967,25 +2057,20 @@ def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; -} // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0] +} // End Uses = [EXEC], Defs = [EXEC,VCC,M0] multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1, Uses = [EXEC] in { - def _SAVE : InstSI < + def _SAVE : PseudoInstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx), - "", [] - > { + (ins sgpr_class:$src, i32imm:$frame_idx)> { let mayStore = 1; let mayLoad = 0; } - def _RESTORE : InstSI < + def _RESTORE : PseudoInstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx), - "", [] - > { + (ins i32imm:$frame_idx)> { let mayStore = 0; let mayLoad = 1; } @@ -1993,9 +2078,9 @@ multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { } // It's unclear whether you can use M0 as the output of v_readlane_b32 -// instructions, so use SGPR_32 register class for spills to prevent +// instructions, so use SReg_32_XM0 register class for spills to prevent // this from happening. 
-defm SI_SPILL_S32 : SI_SPILL_SGPR <SGPR_32>; +defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32_XM0>; defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; @@ -2003,21 +2088,18 @@ defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { - def _SAVE : InstSI < + def _SAVE : PseudoInstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), - "", [] - > { + SReg_32:$scratch_offset, i32imm:$offset)> { let mayStore = 1; let mayLoad = 0; } - def _RESTORE : InstSI < + def _RESTORE : PseudoInstSI < (outs vgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), - "", [] - > { + (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset, + i32imm:$offset)> { let mayStore = 0; let mayLoad = 1; } @@ -2033,29 +2115,19 @@ defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; let Defs = [SCC] in { -def SI_CONSTDATA_PTR : InstSI < +def SI_PC_ADD_REL_OFFSET : PseudoInstSI < (outs SReg_64:$dst), - (ins const_ga:$ptr), - "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] -> { + (ins si_ga:$ptr), + [(set SReg_64:$dst, (i64 (SIpc_add_rel_offset (tglobaladdr:$ptr))))]> { let SALU = 1; } } // End Defs = [SCC] -} // end IsCodeGenOnly, isPseudo - -} // end SubtargetPredicate = isGCN +} // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { -def : Pat< - (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2), - (V_CNDMASK_B32_e64 $src2, $src1, - (V_CMP_GT_F32_e64 SRCMODS.NONE, 0, SRCMODS.NONE, $src0, - DSTCLAMP.NONE, DSTOMOD.NONE)) ->; - def : Pat < (int_AMDGPU_kilp), (SI_KILL 0xbf800000) @@ -2067,7 +2139,6 @@ def : Pat< (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0) >; -/* int_SI_export */ def : Pat < (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, f32:$src0, f32:$src1, f32:$src2, f32:$src3), @@ -2076,6 +2147,217 @@ def : Pat < >; //===----------------------------------------------------------------------===// +// buffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc)), + (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc)), + (!cast<MUBUF>(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">; +defm 
: MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">; +defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">; +defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">; +defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">; +defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">; + +multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt, + string opcode> { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$glc, imm:$slc), + (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $glc), + (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$glc, imm:$slc), + (!cast<MUBUF>(opcode # _BOTHEN) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">; +defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">; + +//===----------------------------------------------------------------------===// +// buffer_atomic patterns +//===----------------------------------------------------------------------===// +multiclass BufferAtomicPatterns<SDPatternOperator name, string opcode> { + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast<MUBUF>(opcode # _RTN_OFFSET) $vdata_in, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (!cast<MUBUF>(opcode # _RTN_IDXEN) $vdata_in, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast<MUBUF>(opcode # _RTN_OFFEN) $vdata_in, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i1imm $slc)) + >; + + def : Pat< + (name i32:$vdata_in, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (!cast<MUBUF>(opcode # _RTN_BOTHEN) + $vdata_in, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)) + >; +} + +defm : 
BufferAtomicPatterns<int_amdgcn_buffer_atomic_swap, "BUFFER_ATOMIC_SWAP">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_add, "BUFFER_ATOMIC_ADD">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_sub, "BUFFER_ATOMIC_SUB">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smin, "BUFFER_ATOMIC_SMIN">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umin, "BUFFER_ATOMIC_UMIN">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_smax, "BUFFER_ATOMIC_SMAX">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_umax, "BUFFER_ATOMIC_UMAX">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_and, "BUFFER_ATOMIC_AND">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_or, "BUFFER_ATOMIC_OR">; +defm : BufferAtomicPatterns<int_amdgcn_buffer_atomic_xor, "BUFFER_ATOMIC_XOR">; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_IDXEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_OFFEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + +def : Pat< + (int_amdgcn_buffer_atomic_cmpswap + i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$slc), + (EXTRACT_SUBREG + (BUFFER_ATOMIC_CMPSWAP_RTN_BOTHEN + (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1), + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $slc)), + sub0) +>; + + +//===----------------------------------------------------------------------===// +// S_GETREG_B32 Intrinsic Pattern. +//===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_s_getreg imm:$simm16), + (S_GETREG_B32 (as_i16imm $simm16)) +>; + +//===----------------------------------------------------------------------===// +// DS_SWIZZLE Intrinsic Pattern. 
+//===----------------------------------------------------------------------===// +def : Pat < + (int_amdgcn_ds_swizzle i32:$src, imm:$offset16), + (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) +>; + +//===----------------------------------------------------------------------===// // SMRD Patterns //===----------------------------------------------------------------------===// @@ -2109,7 +2391,6 @@ let AddedComplexity = 100 in { defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; @@ -2143,7 +2424,7 @@ def : Pat < def : Pat < (i64 (ctpop i64:$src)), (i64 (REG_SEQUENCE SReg_64, - (S_BCNT1_I32_B64 $src), sub0, + (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0, (S_MOV_B32 0), sub1)) >; @@ -2168,8 +2449,8 @@ def : Pat < //===----------------------------------------------------------------------===// def : Pat < - (int_AMDGPU_barrier_global), - (S_BARRIER) + (int_amdgcn_s_waitcnt i32:$simm16), + (S_WAITCNT (as_i16imm $simm16)) >; //===----------------------------------------------------------------------===// @@ -2184,7 +2465,22 @@ let Predicates = [UnsafeFPMath] in { def : RsqPat<V_RSQ_F32_e32, f32>; def : RsqPat<V_RSQ_F64_e32, f64>; -} + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [UnsafeFPMath] //===----------------------------------------------------------------------===// // VOP2 Patterns @@ -2217,9 +2513,9 @@ def : Pat < class SampleRawPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, i32:$dmask, i32:$unorm, i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc, $sampler) + (opcode $addr, $rsrc, $sampler, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { @@ -2232,11 +2528,11 @@ multiclass SampleRawPatterns<SDPatternOperator name, string opcode> { // Image only class ImagePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v8i32:$rsrc, i32:$dmask, i32:$unorm, - i32:$r128, i32:$da, i32:$glc, i32:$slc, i32:$tfe, i32:$lwe), - (opcode (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $da), - (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $slc), - $addr, $rsrc) + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$unorm, + imm:$r128, imm:$da, imm:$glc, imm:$slc, imm:$tfe, imm:$lwe), + (opcode $addr, $rsrc, + (as_i32imm $dmask), (as_i1imm $unorm), (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), (as_i1imm $tfe), (as_i1imm $lwe), (as_i1imm $da)) >; multiclass ImagePatterns<SDPatternOperator name, string opcode> { @@ -2245,6 +2541,54 @@ multiclass 
ImagePatterns<SDPatternOperator name, string opcode> { def : ImagePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; } +class ImageLoadPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name vt:$addr, v8i32:$rsrc, imm:$dmask, imm:$r128, imm:$da, imm:$glc, + imm:$slc), + (opcode $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> { + def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImageLoadPattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +class ImageStorePattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name v4f32:$data, vt:$addr, v8i32:$rsrc, i32:$dmask, imm:$r128, imm:$da, + imm:$glc, imm:$slc), + (opcode $data, $addr, $rsrc, + (as_i32imm $dmask), 1, (as_i1imm $glc), (as_i1imm $slc), + (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageStorePatterns<SDPatternOperator name, string opcode> { + def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V1), i32>; + def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V2), v2i32>; + def : ImageStorePattern<name, !cast<MIMG>(opcode # _V4_V4), v4i32>; +} + +class ImageAtomicPattern<SDPatternOperator name, MIMG opcode, ValueType vt> : Pat < + (name i32:$vdata, vt:$addr, v8i32:$rsrc, imm:$r128, imm:$da, imm:$slc), + (opcode $vdata, $addr, $rsrc, 1, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)) +>; + +multiclass ImageAtomicPatterns<SDPatternOperator name, string opcode> { + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V1), i32>; + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V2), v2i32>; + def : ImageAtomicPattern<name, !cast<MIMG>(opcode # _V4), v4i32>; +} + +class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat < + (int_amdgcn_image_atomic_cmpswap i32:$vsrc, i32:$vcmp, vt:$addr, v8i32:$rsrc, + imm:$r128, imm:$da, imm:$slc), + (EXTRACT_SUBREG + (opcode (REG_SEQUENCE VReg_64, $vsrc, sub0, $vcmp, sub1), + $addr, $rsrc, 3, 1, 1, (as_i1imm $slc), (as_i1imm $r128), 0, 0, (as_i1imm $da)), + sub0) +>; + // Basic sample defm : SampleRawPatterns<int_SI_image_sample, "IMAGE_SAMPLE">; defm : SampleRawPatterns<int_SI_image_sample_cl, "IMAGE_SAMPLE_CL">; @@ -2341,38 +2685,57 @@ def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>; def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>; defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">; defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">; +defm : ImageLoadPatterns<int_amdgcn_image_load, "IMAGE_LOAD">; +defm : ImageLoadPatterns<int_amdgcn_image_load_mip, "IMAGE_LOAD_MIP">; +defm : ImageStorePatterns<int_amdgcn_image_store, "IMAGE_STORE">; +defm : ImageStorePatterns<int_amdgcn_image_store_mip, "IMAGE_STORE_MIP">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_swap, "IMAGE_ATOMIC_SWAP">; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V1, i32>; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V2, v2i32>; +def : ImageAtomicCmpSwapPattern<IMAGE_ATOMIC_CMPSWAP_V4, v4i32>; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_add, "IMAGE_ATOMIC_ADD">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_sub, "IMAGE_ATOMIC_SUB">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_smin, "IMAGE_ATOMIC_SMIN">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umin, "IMAGE_ATOMIC_UMIN">; +defm : 
ImageAtomicPatterns<int_amdgcn_image_atomic_smax, "IMAGE_ATOMIC_SMAX">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_umax, "IMAGE_ATOMIC_UMAX">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_and, "IMAGE_ATOMIC_AND">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_or, "IMAGE_ATOMIC_OR">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_xor, "IMAGE_ATOMIC_XOR">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_inc, "IMAGE_ATOMIC_INC">; +defm : ImageAtomicPatterns<int_amdgcn_image_atomic_dec, "IMAGE_ATOMIC_DEC">; /* SIsample for simple 1D texture lookup */ def : Pat < - (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (SIsample i32:$addr, v8i32:$rsrc, v4i32:$sampler, imm), + (IMAGE_SAMPLE_V4_V1 $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, imm), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT), - (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_RECT), + (opcode $addr, $rsrc, $sampler, 0xf, 1, 0, 0, 0, 0, 0, 0) >; class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; class SampleShadowPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 0) >; class SampleShadowArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat < - (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler) + (name vt:$addr, v8i32:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY), + (opcode $addr, $rsrc, $sampler, 0xf, 0, 0, 0, 0, 0, 0, 1) >; /* SIsample* for texture lookups consuming more address parameters */ @@ -2422,68 +2785,10 @@ defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16, IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16, v16i32>; -/* int_SI_imageload for texture fetches consuming varying address parameters */ -class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < - (name addr_type:$addr, v32i8:$rsrc, imm), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA), - (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc) ->; - -class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat < - (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA), - (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc) ->; - -multiclass ImageLoadPatterns<MIMG opcode, ValueType 
addr_type> { - def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>; - def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>; -} - -multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> { - def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>; - def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>; -} - -defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>; -defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>; - -defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>; -defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>; - -/* Image resource information */ -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - -def : Pat < - (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA), - (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc) ->; - /********** ============================================ **********/ /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ -//def : Extract_Element<i64, v2i64, 0, sub0_sub1>; -//def : Extract_Element<i64, v2i64, 1, sub2_sub3>; -//def : Extract_Element<f64, v2f64, 0, sub0_sub1>; -//def : Extract_Element<f64, v2f64, 1, sub2_sub3>; - foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) @@ -2548,50 +2853,47 @@ foreach Index = 0-15 in { >; } -def : BitConvert <i32, f32, SReg_32>; +// FIXME: Why do only some of these type combinations for SReg and +// VReg? 
+// 32-bit bitcast def : BitConvert <i32, f32, VGPR_32>; - -def : BitConvert <f32, i32, SReg_32>; def : BitConvert <f32, i32, VGPR_32>; +def : BitConvert <i32, f32, SReg_32>; +def : BitConvert <f32, i32, SReg_32>; +// 64-bit bitcast def : BitConvert <i64, f64, VReg_64>; - def : BitConvert <f64, i64, VReg_64>; - -def : BitConvert <v2f32, v2i32, VReg_64>; def : BitConvert <v2i32, v2f32, VReg_64>; -def : BitConvert <v2i32, i64, VReg_64>; +def : BitConvert <v2f32, v2i32, VReg_64>; def : BitConvert <i64, v2i32, VReg_64>; -def : BitConvert <v2f32, i64, VReg_64>; +def : BitConvert <v2i32, i64, VReg_64>; def : BitConvert <i64, v2f32, VReg_64>; -def : BitConvert <v2f32, f64, VReg_64>; -def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <v2f32, i64, VReg_64>; def : BitConvert <f64, v2f32, VReg_64>; +def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; -def : BitConvert <v4f32, v4i32, VReg_128>; +def : BitConvert <v2i32, f64, VReg_64>; def : BitConvert <v4i32, v4f32, VReg_128>; +def : BitConvert <v4f32, v4i32, VReg_128>; - +// 128-bit bitcast def : BitConvert <v2i64, v4i32, SReg_128>; def : BitConvert <v4i32, v2i64, SReg_128>; - def : BitConvert <v2f64, v4f32, VReg_128>; def : BitConvert <v2f64, v4i32, VReg_128>; def : BitConvert <v4f32, v2f64, VReg_128>; def : BitConvert <v4i32, v2f64, VReg_128>; +def : BitConvert <v2i64, v2f64, VReg_128>; +def : BitConvert <v2f64, v2i64, VReg_128>; - - - -def : BitConvert <v8f32, v8i32, SReg_256>; +// 256-bit bitcast def : BitConvert <v8i32, v8f32, SReg_256>; -def : BitConvert <v8i32, v32i8, SReg_256>; -def : BitConvert <v32i8, v8i32, SReg_256>; -def : BitConvert <v8i32, v32i8, VReg_256>; +def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, VReg_256>; def : BitConvert <v8f32, v8i32, VReg_256>; -def : BitConvert <v32i8, v8i32, VReg_256>; +// 512-bit bitcast def : BitConvert <v16i32, v16f32, VReg_512>; def : BitConvert <v16f32, v16i32, VReg_512>; @@ -2613,7 +2915,7 @@ def : Pat < def : Pat < (fneg (fabs f32:$src)), - (S_OR_B32 $src, 0x80000000) /* Set sign bit */ + (S_OR_B32 $src, 0x80000000) // Set sign bit >; // FIXME: Should use S_OR_B32 @@ -2703,15 +3005,9 @@ def : Pat < /********** Intrinsic Patterns **********/ /********** ================== **********/ -/* llvm.AMDGPU.pow */ def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; def : Pat < - (int_AMDGPU_div f32:$src0, f32:$src1), - (V_MUL_LEGACY_F32_e32 $src0, (V_RCP_LEGACY_F32_e32 $src1)) ->; - -def : Pat < (int_AMDGPU_cube v4f32:$src), (REG_SEQUENCE VReg_128, (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0), @@ -2745,7 +3041,7 @@ class Ext32Pat <SDNode ext> : Pat < def : Ext32Pat <zext>; def : Ext32Pat <anyext>; -// Offset in an 32Bit VGPR +// Offset in an 32-bit VGPR def : Pat < (SIload_constant v4i32:$sbase, i32:$voff), (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0) @@ -2759,12 +3055,6 @@ def : Pat < (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) >; -def : Pat < - (int_SI_tid), - (V_MBCNT_HI_U32_B32_e64 0xffffffff, - (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0)) ->; - //===----------------------------------------------------------------------===// // VOP3 Patterns //===----------------------------------------------------------------------===// @@ -2772,16 +3062,6 @@ def : Pat < def : IMad24Pat<V_MAD_I32_I24>; def : UMad24Pat<V_MAD_U32_U24>; -def : Pat < - (mulhu i32:$src0, i32:$src1), - (V_MUL_HI_U32 $src0, $src1) ->; - -def : Pat < - (mulhs i32:$src0, i32:$src1), - (V_MUL_HI_I32 $src0, $src1) ->; - defm : 
BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; def : ROTRPattern <V_ALIGNBIT_B32>; @@ -2839,19 +3119,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < (inst $ptr, $value, (as_i16imm $offset), (i1 0)) >; -// Special case of DSAtomicRetPat for add / sub 1 -> inc / dec -// -// We need to use something for the data0, so we set a register to -// -1. For the non-rtn variants, the manual says it does -// DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max -// will always do the increment so I'm assuming it's the same. -class DSAtomicIncRetPat<DS inst, ValueType vt, - Instruction LoadImm, PatFrag frag> : Pat < - (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), - (inst $ptr, (LoadImm (vt -1)), (as_i16imm $offset), (i1 0)) ->; - - class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap), (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 0)) @@ -2859,14 +3126,11 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < // 32-bit atomics. -def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - V_MOV_B32_e32, si_atomic_load_add_local>; -def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - V_MOV_B32_e32, si_atomic_load_sub_local>; - def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>; +def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>; def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>; def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>; def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>; @@ -2874,18 +3138,14 @@ def : DSAtomicRetPat<DS_MIN_RTN_I32, i32, si_atomic_load_min_local>; def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>; def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>; def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>; - def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. 
-def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - V_MOV_B64_PSEUDO, si_atomic_load_add_local>; -def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; - def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>; +def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>; +def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>; def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>; def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>; def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>; @@ -2901,20 +3161,35 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B64, i64, si_atomic_cmp_swap_64_local>; // MUBUF Patterns //===----------------------------------------------------------------------===// -multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, - PatFrag constant_ld> { - def : Pat < +class MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt, + PatFrag constant_ld> : Pat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe))), (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe) >; + +multiclass MUBUFLoad_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET, + ValueType vt, PatFrag atomic_ld> { + def : Pat < + (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + >; } let Predicates = [isSICI] in { -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; -defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; +def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>; +def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>; +def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>; +def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>; + +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>; +defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>; } // End Predicates = [isSICI] class MUBUFScratchLoadPat <MUBUF Instr, ValueType vt, PatFrag ld> : Pat < @@ -2975,6 +3250,25 @@ defm : MUBUF_Load_Dword <v2i32, BUFFER_LOAD_DWORDX2_OFFSET, BUFFER_LOAD_DWORDX2_ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_IDXEN, BUFFER_LOAD_DWORDX4_BOTHEN>; +multiclass MUBUFStore_Atomic_Pattern <MUBUF Instr_ADDR64, MUBUF Instr_OFFSET, + ValueType vt, PatFrag atomic_st> { + // Store follows atomic op convention so address is forst + def : Pat < + (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, + i16:$offset, i1:$slc), vt:$val), + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + >; + + def : Pat < + (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 
0, 0) + >; +} +let Predicates = [isSICI] in { +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORD_ADDR64, BUFFER_STORE_DWORD_OFFSET, i32, global_store_atomic>; +defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>; +} // End Predicates = [isSICI] + class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat < (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset)), @@ -2987,22 +3281,6 @@ def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>; def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>; -/* -class MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> : Pat < - (st vt:$value, (MUBUFScratch v4i32:$srsrc, i64:$vaddr, u16imm:$offset)), - (Instr $value, $srsrc, $vaddr, $offset) ->; - -let Predicates = [isSICI] in { -def : MUBUFStore_Pattern <BUFFER_STORE_BYTE_ADDR64, i32, truncstorei8_private>; -def : MUBUFStore_Pattern <BUFFER_STORE_SHORT_ADDR64, i32, truncstorei16_private>; -def : MUBUFStore_Pattern <BUFFER_STORE_DWORD_ADDR64, i32, store_private>; -def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2_ADDR64, v2i32, store_private>; -def : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4_ADDR64, v4i32, store_private>; -} // End Predicates = [isSICI] - -*/ - //===----------------------------------------------------------------------===// // MTBUF Patterns //===----------------------------------------------------------------------===// @@ -3029,29 +3307,16 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; /********** ====================== **********/ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { - - // 1. Extract with offset + // Extract with offset def : Pat< - (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), - (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) + (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; - // 2. Extract without offset + // Insert with offset def : Pat< - (eltvt (extractelt vt:$vec, i32:$idx)), - (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) - >; - - // 3. Insert with offset - def : Pat< - (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) - >; - - // 4. Insert without offset - def : Pat< - (insertelt vt:$vec, eltvt:$val, i32:$idx), - (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) + (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; } @@ -3111,10 +3376,12 @@ def : ZExt_i64_i32_Pat<anyext>; def : ZExt_i64_i1_Pat<zext>; def : ZExt_i64_i1_Pat<anyext>; +// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that +// REG_SEQUENCE patterns don't support instructions with multiple outputs. 
def : Pat < (i64 (sext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, - (S_ASHR_I32 $src, 31), sub1) + (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1) >; def : Pat < @@ -3214,6 +3481,23 @@ defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; def : BFEPattern <V_BFE_U32, S_MOV_B32>; +let Predicates = [isSICI] in { +def : Pat < + (i64 (readcyclecounter)), + (S_MEMTIME) +>; +} + +def : Pat< + (fcanonicalize f32:$src), + (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0) +>; + +def : Pat< + (fcanonicalize f64:$src), + (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0) +>; + //===----------------------------------------------------------------------===// // Fract Patterns //===----------------------------------------------------------------------===// @@ -3226,21 +3510,6 @@ let Predicates = [isSI] in { // The workaround for the V_FRACT bug is: // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) -// Convert (x + (-floor(x)) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_CNDMASK_B64_PSEUDO - (V_MIN_F64 - SRCMODS.NONE, - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE), - SRCMODS.NONE, - (V_MOV_B64_PSEUDO 0x3fefffffffffffff), - DSTCLAMP.NONE, DSTOMOD.NONE), - $x, - (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)) ->; - // Convert floor(x) to (x - fract(x)) def : Pat < (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), @@ -3268,6 +3537,9 @@ def : Pat < def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>; +def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>; +def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>; + //============================================================================// // Assembler aliases //============================================================================// diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 027a0a2f5167..a9b7c39096e7 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -13,8 +13,6 @@ let TargetPrefix = "SI", isTarget = 1 in { - - def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>; def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; @@ -50,9 +48,9 @@ let TargetPrefix = "SI", isTarget = 1 in { llvm_i32_ty, // glc(imm) llvm_i32_ty, // slc(imm) llvm_i32_ty], // tfe(imm) - [IntrReadArgMem]>; + [IntrReadMem, IntrArgMemOnly]>; - def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>; // Fully-flexible SAMPLE instruction. class SampleRaw : Intrinsic < @@ -172,28 +170,20 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_image_load_mip : Image; def int_SI_getresinfo : Image; - // Deprecated image and sample intrinsics. 
- class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; - - def int_SI_sample : Sample; - def int_SI_sampleb : Sample; - def int_SI_sampled : Sample; - def int_SI_samplel : Sample; - def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_resinfo : Intrinsic <[llvm_v4i32_ty], [llvm_i32_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; - /* Interpolation Intrinsics */ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; +} // End TargetPrefix = "SI", isTarget = 1 +let TargetPrefix = "amdgcn", isTarget = 1 in { /* Control flow Intrinsics */ - def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; - def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; - def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; - def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; - def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; - def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; + def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; + def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; + def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; + def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; + def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; + def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; + def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>; } diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 1bdb1f0ee9f9..9e972a569a0f 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -26,7 +26,7 @@ // // - Live interval recomputing seems inefficient. This currently only matches // one pair, and recomputes live intervals and moves on to the next pair. It -// would be better to compute a list of all merges that need to occur +// would be better to compute a list of all merges that need to occur. // // - With a list of instructions to process, we can also merge more. 
If a // cluster of loads have offsets that are too large to fit in the 8-bit @@ -36,6 +36,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" @@ -61,7 +62,6 @@ private: MachineRegisterInfo *MRI; LiveIntervals *LIS; - static bool offsetsCanBeCombined(unsigned Offset0, unsigned Offset1, unsigned EltSize); @@ -69,10 +69,6 @@ private: MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I, unsigned EltSize); - void updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx); - MachineBasicBlock::iterator mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -193,17 +189,6 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I, return E; } -void SILoadStoreOptimizer::updateRegDefsUses(unsigned SrcReg, - unsigned DstReg, - unsigned SubIdx) { - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg), - E = MRI->reg_end(); I != E; ) { - MachineOperand &O = *I; - ++I; - O.substVirtReg(DstReg, SubIdx, *TRI); - } -} - MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -268,19 +253,19 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( .addOperand(*Dest1) .addReg(DestReg, RegState::Kill, SubRegIdx1); - LIS->InsertMachineInstrInMaps(Read2); + LIS->InsertMachineInstrInMaps(*Read2); // repairLiveintervalsInRange() doesn't handle physical register, so we have // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); // The new write to the original destination register is now the copy. Steal // the old SlotIndex. - LIS->ReplaceMachineInstrInMaps(I, Copy0); - LIS->ReplaceMachineInstrInMaps(Paired, Copy1); + LIS->ReplaceMachineInstrInMaps(*I, *Copy0); + LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1); I->eraseFromParent(); Paired->eraseFromParent(); @@ -291,7 +276,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair( LIS->createAndComputeVirtRegInterval(DestReg); if (UpdateM0Range) { - SlotIndex Read2Index = LIS->getInstructionIndex(Read2); + SlotIndex Read2Index = LIS->getInstructionIndex(*Read2); M0Segment->end = Read2Index.getRegSlot(); } @@ -340,7 +325,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // repairLiveintervalsInRange() doesn't handle physical register, so we have // to update the M0 range manually. - SlotIndex PairedIndex = LIS->getInstructionIndex(Paired); + SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired); LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI)); LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex); bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot(); @@ -359,8 +344,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( // XXX - How do we express subregisters here? 
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() }; - LIS->RemoveMachineInstrFromMaps(I); - LIS->RemoveMachineInstrFromMaps(Paired); + LIS->RemoveMachineInstrFromMaps(*I); + LIS->RemoveMachineInstrFromMaps(*Paired); I->eraseFromParent(); Paired->eraseFromParent(); @@ -368,7 +353,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair( LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs); if (UpdateM0Range) { - SlotIndex Write2Index = LIS->getInstructionIndex(Write2); + SlotIndex Write2Index = LIS->getInstructionIndex(*Write2); M0Segment->end = Write2Index.getRegSlot(); } @@ -423,9 +408,16 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { } bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { - const TargetSubtargetInfo &STM = MF.getSubtarget(); - TRI = static_cast<const SIRegisterInfo *>(STM.getRegisterInfo()); - TII = static_cast<const SIInstrInfo *>(STM.getInstrInfo()); + if (skipFunction(*MF.getFunction())) + return false; + + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + if (!STM.loadStoreOptEnabled()) + return false; + + TII = STM.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); LIS = &getAnalysis<LiveIntervals>(); diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 126f6245dfc0..ee1d5dae70b7 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -52,6 +52,7 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -61,24 +62,24 @@ using namespace llvm; -namespace { +#define DEBUG_TYPE "si-lower-control-flow" -class SILowerControlFlowPass : public MachineFunctionPass { +namespace { +class SILowerControlFlow : public MachineFunctionPass { private: static const unsigned SkipThreshold = 12; - static char ID; const SIRegisterInfo *TRI; const SIInstrInfo *TII; bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); + bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB); void If(MachineInstr &MI); - void Else(MachineInstr &MI); + void Else(MachineInstr &MI, bool ExecModified); void Break(MachineInstr &MI); void IfBreak(MachineInstr &MI); void ElseBreak(MachineInstr &MI); @@ -88,56 +89,118 @@ private: void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); - void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset); - void IndirectSrc(MachineInstr &MI); - void IndirectDst(MachineInstr &MI); + MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const; + + std::pair<MachineBasicBlock *, MachineBasicBlock *> + splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + + void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg); + + void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, + MachineInstr *MovRel, + const MachineOperand &IdxReg, + int Offset); + + bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0); + 
std::pair<unsigned, int> computeIndirectRegAndOffset(unsigned VecReg, + int Offset) const; + bool indirectSrc(MachineInstr &MI); + bool indirectDst(MachineInstr &MI); public: - SILowerControlFlowPass(TargetMachine &tm) : + static char ID; + + SILowerControlFlow() : MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI Lower control flow instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); + return "SI Lower control flow pseudo instructions"; } }; } // End anonymous namespace -char SILowerControlFlowPass::ID = 0; +char SILowerControlFlow::ID = 0; + +INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, + "SI lower control flow", false, false) -FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { - return new SILowerControlFlowPass(tm); +char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID; + + +FunctionPass *llvm::createSILowerControlFlowPass() { + return new SILowerControlFlow(); } -bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From, - MachineBasicBlock *To) { +static bool opcodeEmitsNoInsts(unsigned Opc) { + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::BUNDLE: + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + case TargetOpcode::DBG_VALUE: + return true; + default: + return false; + } +} + +bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From, + MachineBasicBlock *To) { + if (From->succ_empty()) + return false; unsigned NumInstr = 0; + MachineFunction *MF = From->getParent(); - for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty(); - MBB = *MBB->succ_begin()) { + for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end(); + MBBI != End && MBBI != ToI; ++MBBI) { + MachineBasicBlock &MBB = *MBBI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); NumInstr < SkipThreshold && I != E; ++I) { + if (opcodeEmitsNoInsts(I->getOpcode())) + continue; + + // When a uniform loop is inside non-uniform control flow, the branch + // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken + // when EXEC = 0. We should skip the loop lest it becomes infinite. + if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ || + I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) + return true; + + if (I->isInlineAsm()) { + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + const char *AsmStr = I->getOperand(0).getSymbolName(); + + // inlineasm length estimate is number of bytes assuming the longest + // instruction. 
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); + NumInstr += MaxAsmSize / MAI->getMaxInstLength(); + } else { + ++NumInstr; + } - if (I->isBundle() || !I->isBundled()) - if (++NumInstr >= SkipThreshold) - return true; + if (NumInstr >= SkipThreshold) + return true; } } return false; } -void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { +void SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) { if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB())) return; @@ -147,40 +210,44 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { .addOperand(To); } -void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { - +bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + MachineFunction *MF = MBB.getParent(); - if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() != - ShaderType::PIXEL || + if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) - return; + return false; + + MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator()); + MBB.addSuccessor(SkipBB); - MachineBasicBlock::iterator Insert = &MI; - ++Insert; + const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3); + BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&NextBB); + + MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef); + + // ... and terminate wavefront. + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + + return true; } -void SILowerControlFlowPass::If(MachineInstr &MI) { +void SILowerControlFlow::If(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Reg = MI.getOperand(0).getReg(); @@ -195,10 +262,15 @@ void SILowerControlFlowPass::If(MachineInstr &MI) { Skip(MI, MI.getOperand(2)); + // Insert a pseudo terminator to help keep the verifier happy. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addOperand(MI.getOperand(2)) + .addReg(Reg); + MI.eraseFromParent(); } -void SILowerControlFlowPass::Else(MachineInstr &MI) { +void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); @@ -208,22 +280,36 @@ void SILowerControlFlowPass::Else(MachineInstr &MI) { TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) .addReg(Src); // Saved EXEC + if (ExecModified) { + // Adjust the saved exec to account for the modifications during the flow + // block that contains the ELSE. This can happen when WQM mode is switched + // off. 
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + } + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC) .addReg(Dst); Skip(MI, MI.getOperand(2)); + // Insert a pseudo terminator to help keep the verifier happy. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH)) + .addOperand(MI.getOperand(2)) + .addReg(Dst); + MI.eraseFromParent(); } -void SILowerControlFlowPass::Break(MachineInstr &MI) { +void SILowerControlFlow::Break(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(AMDGPU::EXEC) .addReg(Src); @@ -231,14 +317,14 @@ void SILowerControlFlowPass::Break(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { +void SILowerControlFlow::IfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Vcc = MI.getOperand(1).getReg(); unsigned Src = MI.getOperand(2).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(Vcc) .addReg(Src); @@ -246,14 +332,14 @@ void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { +void SILowerControlFlow::ElseBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Saved = MI.getOperand(1).getReg(); unsigned Src = MI.getOperand(2).getReg(); - + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) .addReg(Saved) .addReg(Src); @@ -261,7 +347,7 @@ void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::Loop(MachineInstr &MI) { +void SILowerControlFlow::Loop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Src = MI.getOperand(0).getReg(); @@ -276,7 +362,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::EndCf(MachineInstr &MI) { +void SILowerControlFlow::EndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); unsigned Reg = MI.getOperand(0).getReg(); @@ -289,24 +375,24 @@ void SILowerControlFlowPass::EndCf(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::Branch(MachineInstr &MI) { - if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode()) +void SILowerControlFlow::Branch(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getOperand(0).getMBB(); + if (MBB == MI.getParent()->getNextNode()) MI.eraseFromParent(); // If these aren't equal, this is probably an infinite loop. } -void SILowerControlFlowPass::Kill(MachineInstr &MI) { +void SILowerControlFlow::Kill(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); const MachineOperand &Op = MI.getOperand(0); #ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo<SIMachineFunctionInfo>(); + CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); // Kill is only allowed in pixel / geometry shaders. 
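Stepping back to the SI_BREAK / SI_IF_BREAK / SI_ELSE_BREAK lowerings above: they are plain mask ORs that accumulate the lanes which have left the loop. A scalar model with one bit per lane (illustrative names; the loop back-edge itself is not shown in this hunk, so the last helper is only my reading of how the accumulated mask is consumed):

#include <cstdint>

using LaneMask = uint64_t; // one bit per lane of the 64-wide wavefront

// SI_BREAK: every currently active lane leaves the loop; merge with the lanes
// that already broke out on earlier iterations (Src).
LaneMask lowerBreak(LaneMask Exec, LaneMask Src) { return Exec | Src; }

// SI_IF_BREAK: only the lanes whose condition (Vcc) holds leave the loop.
LaneMask lowerIfBreak(LaneMask Vcc, LaneMask Src) { return Vcc | Src; }

// Assumed back-edge behaviour: the accumulated mask is removed from EXEC and
// the loop repeats while any lane is still active.
LaneMask loopBackEdge(LaneMask Exec, LaneMask Broken) { return Exec & ~Broken; }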
- assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); + assert(CallConv == CallingConv::AMDGPU_PS || + CallConv == CallingConv::AMDGPU_GS); #endif // Clear this thread from the exec mask if the operand is negative @@ -325,94 +411,209 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { MI.eraseFromParent(); } -void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { +// All currently live registers must remain so in the remainder block. +void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg) { + // Add reg defined in loop body. + RemainderLiveRegs.addReg(SaveReg); + + if (const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val)) { + if (!Val->isUndef()) { + RemainderLiveRegs.addReg(Val->getReg()); + LoopBB.addLiveIn(Val->getReg()); + } + } + + for (unsigned Reg : RemainderLiveRegs) { + if (MRI.isAllocatable(Reg)) + RemainderBB.addLiveIn(Reg); + } + + const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); + if (!Src->isUndef()) + LoopBB.addLiveIn(Src->getReg()); + + if (!IdxReg.isUndef()) + LoopBB.addLiveIn(IdxReg.getReg()); + LoopBB.sortUniqueLiveIns(); +} + +void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, + DebugLoc DL, + MachineInstr *MovRel, + const MachineOperand &IdxReg, + int Offset) { + MachineBasicBlock::iterator I = LoopBB.begin(); + + // Read the next variant into VCC (lower 32 bits) <- also loop target + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Move index from VCC into M0 + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(AMDGPU::VCC_LO); + + // Compare the just read M0 value to all possible Idx values + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef())); + + // Update EXEC, save the original EXEC value to VCC + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) + .addReg(AMDGPU::VCC); + + if (Offset != 0) { + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(AMDGPU::M0) + .addImm(Offset); + } + + // Do the actual move + LoopBB.insert(I, MovRel); + + // Update EXEC, switch all done bits to 0 and all todo bits to 1 + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + + // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover + BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(&LoopBB); +} + +MachineBasicBlock *SILowerControlFlow::insertSkipBlock( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MachineFunction *MF = MBB.getParent(); + + MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, SkipBB); + + return SkipBB; +} + +std::pair<MachineBasicBlock *, MachineBasicBlock *> +SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineFunction *MF = MBB.getParent(); + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. 
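The splitBlock helper introduced below follows the usual "splice the tail into a fresh block" pattern. A container-level analogy of the same move, using a plain std::list instead of MachineBasicBlock (nothing AMDGPU-specific, just the shape of the transformation):

#include <cassert>
#include <iterator>
#include <list>
#include <utility>

// Everything from the split point to the end moves into Remainder; an empty
// Loop region is created in between. MachineBasicBlock::splice plays the role
// of std::list::splice here.
std::pair<std::list<int>, std::list<int>>
splitAt(std::list<int> &Block, std::list<int>::iterator I) {
  std::list<int> Loop;      // stays empty; the loop body is emitted later
  std::list<int> Remainder;
  Remainder.splice(Remainder.begin(), Block, I, Block.end());
  return {std::move(Loop), std::move(Remainder)};
}

int main() {
  std::list<int> Block = {1, 2, 3, 4};
  auto [Loop, Remainder] = splitAt(Block, std::next(Block.begin(), 2));
  assert(Block.size() == 2 && Loop.empty() && Remainder.size() == 2);
}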
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessors(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + return std::make_pair(LoopBB, RemainderBB); +} + +// Returns true if a new block was inserted. +bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - MachineBasicBlock::iterator I = MI; + MachineBasicBlock::iterator I(&MI); - unsigned Save = MI.getOperand(1).getReg(); - unsigned Idx = MI.getOperand(3).getReg(); + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); - if (AMDGPU::SReg_32RegClass.contains(Idx)) { - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(Idx) - .addImm(Offset); + if (AMDGPU::SReg_32RegClass.contains(Idx->getReg())) { + if (Offset != 0) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) + .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())) + .addImm(Offset); } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) + .addReg(Idx->getReg(), getUndefRegState(Idx->isUndef())); } + MBB.insert(I, MovRel); - } else { + MI.eraseFromParent(); + return false; + } - assert(AMDGPU::SReg_64RegClass.contains(Save)); - assert(AMDGPU::VGPR_32RegClass.contains(Idx)); + MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); + SaveOp->setIsDead(false); + unsigned Save = SaveOp->getReg(); - // Save the EXEC mask - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save) - .addReg(AMDGPU::EXEC); + // Reading from a VGPR requires looping over all workitems in the wavefront. 
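The comment above is the key constraint: a VGPR index can differ per lane, so the lowering emits a "waterfall" loop. A rough scalar model of that loop (one bit per lane, illustrative names, not the MachineInstr-level code emitted by emitLoadM0FromVGPRLoop):

#include <array>
#include <cstdint>

constexpr unsigned WaveSize = 64;
using LaneMask = uint64_t;

// Pick the index of the first still-active lane (V_READFIRSTLANE_B32), handle
// every lane that shares that index in one iteration, then retire those lanes
// from EXEC until no active lane remains.
template <typename Fn>
void waterfall(LaneMask Exec, const std::array<uint32_t, WaveSize> &Idx,
               Fn &&HandleUniformIndex) {
  while (Exec != 0) {
    unsigned FirstLane = __builtin_ctzll(Exec); // readfirstlane
    uint32_t M0 = Idx[FirstLane];               // index moved into M0

    LaneMask Match = 0;                         // V_CMP_EQ_U32 result
    for (unsigned L = 0; L < WaveSize; ++L)
      if (((Exec >> L) & 1) && Idx[L] == M0)
        Match |= LaneMask(1) << L;

    HandleUniformIndex(M0, Match);              // the movrel runs here
    Exec &= ~Match; // net effect of the AND_SAVEEXEC / XOR pair: retire lanes
  }
}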
+ assert(AMDGPU::SReg_64RegClass.contains(Save) && + AMDGPU::VGPR_32RegClass.contains(Idx->getReg())); - // Read the next variant into VCC (lower 32 bits) <- also loop target - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - AMDGPU::VCC_LO) - .addReg(Idx); + // Save the EXEC mask + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) + .addReg(AMDGPU::EXEC); - // Move index from VCC into M0 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) - .addReg(AMDGPU::VCC_LO); + LivePhysRegs RemainderLiveRegs(TRI); - // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) - .addReg(AMDGPU::M0) - .addReg(Idx); + RemainderLiveRegs.addLiveOuts(MBB); - // Update EXEC, save the original EXEC value to VCC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) - .addReg(AMDGPU::VCC); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; - if (Offset) { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) - .addReg(AMDGPU::M0) - .addImm(Offset); - } - // Do the actual move - MBB.insert(I, MovRel); + std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - // Update EXEC, switch all done bits to 0 and all todo bits to 1 - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(AMDGPU::VCC); + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); - // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LoopBB); - // Restore EXEC - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) - .addReg(Save); + splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, + *RemainderBB, Save, *Idx); + + emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); + + MachineBasicBlock::iterator First = RemainderBB->begin(); + BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) + .addReg(Save); - } MI.eraseFromParent(); + return true; } -/// \param @VecReg The register which holds element zero of the vector -/// being addressed into. -/// \param[out] @Reg The base register to use in the indirect addressing instruction. -/// \param[in,out] @Offset As an input, this is the constant offset part of the -// indirect Index. e.g. v0 = v[VecReg + Offset] -// As an output, this is a constant value that needs -// to be added to the value stored in M0. -void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, - unsigned &Reg, - int &Offset) { +/// \param @VecReg The register which holds element zero of the vector being +/// addressed into. +// +/// \param[in] @Idx The index operand from the movrel instruction. This must be +// a register, but may be NoRegister. +/// +/// \param[in] @Offset As an input, this is the constant offset part of the +// indirect Index. e.g. v0 = v[VecReg + Offset] As an output, this is a constant +// value that needs to be added to the value stored in M0. 
+std::pair<unsigned, int> +SILowerControlFlow::computeIndirectRegAndOffset(unsigned VecReg, int Offset) const { unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0); if (!SubReg) SubReg = VecReg; + const TargetRegisterClass *SuperRC = TRI->getPhysRegClass(VecReg); const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg); - int RegIdx = TRI->getHWRegIndex(SubReg) + Offset; + int NumElts = SuperRC->getSize() / RC->getSize(); + + int BaseRegIdx = TRI->getHWRegIndex(SubReg); + + // Skip out of bounds offsets, or else we would end up using an undefined + // register. + if (Offset >= NumElts) + return std::make_pair(RC->getRegister(BaseRegIdx), Offset); + int RegIdx = BaseRegIdx + Offset; if (RegIdx < 0) { Offset = RegIdx; RegIdx = 0; @@ -420,77 +621,102 @@ void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg, Offset = 0; } - Reg = RC->getRegister(RegIdx); + unsigned Reg = RC->getRegister(RegIdx); + return std::make_pair(Reg, Offset); } -void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { - +// Return true if a new block was inserted. +bool SILowerControlFlow::indirectSrc(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - unsigned Vec = MI.getOperand(2).getReg(); - int Off = MI.getOperand(4).getImm(); + const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); unsigned Reg; - computeIndirectRegAndOffset(Vec, Reg, Off); + std::tie(Reg, Offset) = computeIndirectRegAndOffset(SrcVec->getReg(), Offset); + + const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + if (Idx->getReg() == AMDGPU::NoRegister) { + // Only had a constant offset, copy the register directly. + BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst) + .addReg(Reg, getUndefRegState(SrcVec->isUndef())); + MI.eraseFromParent(); + return false; + } MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) - .addReg(Reg) - .addReg(Vec, RegState::Implicit); + .addReg(Reg, getUndefRegState(SrcVec->isUndef())) + .addReg(SrcVec->getReg(), RegState::Implicit); - LoadM0(MI, MovRel, Off); + return loadM0(MI, MovRel, Offset); } -void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { - +// Return true if a new block was inserted. +bool SILowerControlFlow::indirectDst(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); + const DebugLoc &DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); - int Off = MI.getOperand(4).getImm(); - unsigned Val = MI.getOperand(5).getReg(); + int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); unsigned Reg; - computeIndirectRegAndOffset(Dst, Reg, Off); + const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); + std::tie(Reg, Offset) = computeIndirectRegAndOffset(Dst, Offset); - MachineInstr *MovRel = - BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) - .addReg(Reg, RegState::Define) - .addReg(Val) - .addReg(Dst, RegState::Implicit); + MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); + if (Idx->getReg() == AMDGPU::NoRegister) { + // Only had a constant offset, copy the register directly. 
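Stepping back to computeIndirectRegAndOffset above: the folding is purely arithmetic. A standalone version with the same clamping behaviour, using plain indices instead of physical registers, plus a few concrete cases:

#include <cassert>
#include <utility>

// Fold a constant element offset into the base register index when it is in
// range; otherwise leave a residual offset that still has to be added to M0.
std::pair<int, int> foldRegAndOffset(int BaseRegIdx, int Offset, int NumElts) {
  if (Offset >= NumElts)       // out of bounds: keep the offset as-is
    return {BaseRegIdx, Offset};
  int RegIdx = BaseRegIdx + Offset;
  if (RegIdx < 0)              // negative index: clamp to element 0,
    return {0, RegIdx};        // the negative remainder goes to M0
  return {RegIdx, 0};          // fully folded into the register index
}

int main() {
  assert(foldRegAndOffset(8, 2, 4) == std::make_pair(10, 0));  // folded
  assert(foldRegAndOffset(8, 5, 4) == std::make_pair(8, 5));   // out of bounds
  assert(foldRegAndOffset(8, -10, 4) == std::make_pair(0, -2)); // clamped
}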
+ BuildMI(MBB, MI.getIterator(), DL, TII->get(AMDGPU::V_MOV_B32_e32), Reg) + .addOperand(*Val); + MI.eraseFromParent(); + return false; + } + + MachineInstr *MovRel = + BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32), Reg) + .addReg(Val->getReg(), getUndefRegState(Val->isUndef())) + .addReg(Dst, RegState::Implicit); - LoadM0(MI, MovRel, Off); + return loadM0(MI, MovRel, Offset); } -bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - TRI = - static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); +bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); bool HaveKill = false; - bool NeedWQM = false; bool NeedFlat = false; unsigned Depth = 0; - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { + MachineFunction::iterator NextBB; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; BI = NextBB) { + NextBB = std::next(BI); MachineBasicBlock &MBB = *BI; + + MachineBasicBlock *EmptyMBBAtEnd = nullptr; MachineBasicBlock::iterator I, Next; + bool ExecModified = false; + for (I = MBB.begin(); I != MBB.end(); I = Next) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI) || TII->isDS(MI)) - NeedWQM = true; // Flat uses m0 in case it needs to access LDS. if (TII->isFLAT(MI)) NeedFlat = true; + if (I->modifiesRegister(AMDGPU::EXEC, TRI)) + ExecModified = true; + switch (MI.getOpcode()) { default: break; case AMDGPU::SI_IF: @@ -499,7 +725,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { break; case AMDGPU::SI_ELSE: - Else(MI); + Else(MI, ExecModified); break; case AMDGPU::SI_BREAK: @@ -521,16 +747,20 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); HaveKill = false; + // TODO: Insert skip if exec is 0? } + EndCf(MI); break; - case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else + case AMDGPU::SI_KILL_TERMINATOR: + if (Depth == 0) { + if (skipIfDead(MI, *NextBB)) { + NextBB = std::next(BI); + BE = MF.end(); + } + } else HaveKill = true; Kill(MI); break; @@ -544,7 +774,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: - IndirectSrc(MI); + if (indirectSrc(MI)) { + // The block was split at this point. We can safely skip the middle + // inserted block to the following which contains the rest of this + // block's instructions. + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + break; case AMDGPU::SI_INDIRECT_DST_V1: @@ -552,55 +790,46 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: - IndirectDst(MI); + if (indirectDst(MI)) { + // The block was split at this point. We can safely skip the middle + // inserted block to the following which contains the rest of this + // block's instructions. 
+ NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + break; + + case AMDGPU::SI_RETURN: { + assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid()); + + // Graphics shaders returning non-void shouldn't contain S_ENDPGM, + // because external bytecode will be appended at the end. + if (BI != --MF.end() || I != MBB.getFirstTerminator()) { + // SI_RETURN is not the last instruction. Add an empty block at + // the end and jump there. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB.addSuccessor(EmptyMBBAtEnd); + BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + I->eraseFromParent(); + } + break; + } } } } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - - // FIXME: This seems inappropriate to do here. if (NeedFlat && MFI->IsKernel) { - // Insert the prologue initializing the SGPRs pointing to the scratch space - // for flat accesses. - const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - // TODO: What to use with function calls? - - // FIXME: This is reporting stack size that is used in a scratch buffer - // rather than registers as well. - uint64_t StackSizeBytes = FrameInfo->getStackSize(); - - int IndirectBegin - = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF); - // Convert register index to 256-byte unit. - uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); - - assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && - "Stack limits should be smaller than 16-bits"); - - // Initialize the flat scratch register pair. - // TODO: Can we use one s_mov_b64 here? - - // Offset is in units of 256-bytes. - MachineBasicBlock &MBB = MF.front(); - DebugLoc NoDL; - MachineBasicBlock::iterator Start = MBB.getFirstNonPHI(); - const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32); - - assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes)); - - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO) - .addImm(StackOffset); - - // Documentation says size is "per-thread scratch size in bytes" - BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI) - .addImm(StackSizeBytes); + // We will need to Initialize the flat scratch register pair. + if (NeedFlat) + MFI->setHasFlatInstructions(true); } return true; diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index a2fa5fd93aad..dc1d20ddb274 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -18,7 +18,6 @@ #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" -#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -47,8 +46,6 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineDominatorTree>(); - AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -56,11 +53,8 @@ public: } // End anonymous namespace. 
-INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, - "SI Lower i1 Copies", false, false) +INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE, + "SI Lower i1 Copies", false, false) char SILowerI1Copies::ID = 0; @@ -72,9 +66,10 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const TargetRegisterInfo *TRI = &TII->getRegisterInfo(); + std::vector<unsigned> I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 49677fc2b0a3..4d12a1ef9a93 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -1,19 +1,17 @@ -//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===// +//===-- SIMachineFunctionInfo.cpp -------- SI Machine Function Info -------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -/// \file //===----------------------------------------------------------------------===// - #include "SIMachineFunctionInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -22,6 +20,11 @@ using namespace llvm; +static cl::opt<bool> EnableSpillSGPRToVGPR( + "amdgpu-spill-sgpr-to-vgpr", + cl::desc("Enable spilling VGPRs to SGPRs"), + cl::ReallyHidden, + cl::init(true)); // Pin the vtable to this file. 
void SIMachineFunctionInfo::anchor() {} @@ -48,12 +51,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), PSInputAddr(0), ReturnsVoid(true), + MaximumWorkGroupSize(0), + DebuggerReservedVGPRCount(0), + DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}), + DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}), LDSWaveSpillSize(0), PSInputEna(0), NumUserSGPRs(0), NumSystemSGPRs(0), HasSpilledSGPRs(false), HasSpilledVGPRs(false), + HasNonSpillStackObjects(false), + HasFlatInstructions(false), + NumSpilledSGPRs(0), + NumSpilledVGPRs(0), PrivateSegmentBuffer(false), DispatchPtr(false), QueuePtr(false), @@ -63,37 +74,45 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) GridWorkgroupCountX(false), GridWorkgroupCountY(false), GridWorkgroupCountZ(false), - WorkGroupIDX(true), + WorkGroupIDX(false), WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), PrivateSegmentWaveByteOffset(false), - WorkItemIDX(true), + WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false) { - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); const Function *F = MF.getFunction(); PSInputAddr = AMDGPU::getInitialPSInputAddr(*F); const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - if (getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) { KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } - if (F->hasFnAttribute("amdgpu-work-group-id-y")) + if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue()) WorkGroupIDY = true; - if (F->hasFnAttribute("amdgpu-work-group-id-z")) + if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue()) WorkGroupIDZ = true; - if (F->hasFnAttribute("amdgpu-work-item-id-y")) + if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue()) WorkItemIDY = true; - if (F->hasFnAttribute("amdgpu-work-item-id-z")) + if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue()) WorkItemIDZ = true; - bool MaySpill = ST.isVGPRSpillingEnabled(this); + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo->hasStackObjects(); if (HasStackObjects || MaySpill) @@ -105,12 +124,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (F->hasFnAttribute("amdgpu-dispatch-ptr")) DispatchPtr = true; + + if (F->hasFnAttribute("amdgpu-queue-ptr")) + QueuePtr = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; + // We don't need to worry about accessing spills with flat instructions. + // TODO: On VI where we must use flat for global, we should be able to omit + // this if it is never used for generic access. 
+ if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS && + ST.isAmdHsaOS()) + FlatScratchInit = true; + + if (AMDGPU::isCompute(F->getCallingConv())) + MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F); + else + MaximumWorkGroupSize = ST.getWavefrontSize(); + + if (ST.debuggerReserveRegs()) + DebuggerReservedVGPRCount = 4; } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( @@ -142,13 +174,24 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return KernargSegmentPtrUserSGPR; } -SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( +unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { + FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return FlatScratchInitUserSGPR; +} + +SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg ( MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx) { - const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( - MF->getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); + if (!EnableSpillSGPRToVGPR) + return SpilledReg(); + + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + MachineFrameInfo *FrameInfo = MF->getFrameInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); Offset += SubIdx * 4; @@ -157,19 +200,14 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( unsigned Lane = (Offset / 4) % 64; struct SpilledReg Spill; + Spill.Lane = Lane; if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); - if (LaneVGPR == AMDGPU::NoRegister) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("Ran out of VGPRs for spilling SGPR"); - - // When compiling from inside Mesa, the compilation continues. - // Select an arbitrary register to avoid triggering assertions - // during subsequent passes. - LaneVGPR = AMDGPU::VGPR0; - } + if (LaneVGPR == AMDGPU::NoRegister) + // We have no VGPRs left for spilling SGPRs. + return Spill; LaneVGPRs[LaneVGPRIdx] = LaneVGPR; @@ -182,14 +220,10 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( } Spill.VGPR = LaneVGPRs[LaneVGPRIdx]; - Spill.Lane = Lane; return Spill; } unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - // FIXME: We should get this information from kernel attributes if it - // is available. - return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); + return MaximumWorkGroupSize; } diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 846ee5de057d..f5bd6366c717 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -11,12 +11,12 @@ // //===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H -#define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" #include "SIRegisterInfo.h" +#include <array> #include <map> namespace llvm { @@ -25,7 +25,7 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. -class SIMachineFunctionInfo : public AMDGPUMachineFunction { +class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // FIXME: This should be removed and getPreloadedValue moved here. friend struct SIRegisterInfo; void anchor() override; @@ -61,6 +61,15 @@ class SIMachineFunctionInfo : public AMDGPUMachineFunction { unsigned PSInputAddr; bool ReturnsVoid; + unsigned MaximumWorkGroupSize; + + // Number of reserved VGPRs for debugger usage. + unsigned DebuggerReservedVGPRCount; + // Stack object indices for work group IDs. + std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices; + // Stack object indices for work item IDs. + std::array<int, 3> DebuggerWorkItemIDStackObjectIndices; + public: // FIXME: Make private unsigned LDSWaveSpillSize; @@ -73,6 +82,11 @@ public: private: bool HasSpilledSGPRs; bool HasSpilledVGPRs; + bool HasNonSpillStackObjects; + bool HasFlatInstructions; + + unsigned NumSpilledSGPRs; + unsigned NumSpilledVGPRs; // Feature bits required for inputs passed in user SGPRs. bool PrivateSegmentBuffer : 1; @@ -96,7 +110,6 @@ private: bool WorkItemIDY : 1; bool WorkItemIDZ : 1; - MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -111,8 +124,9 @@ public: unsigned VGPR; int Lane; SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) { } - SpilledReg() : VGPR(0), Lane(-1) { } + SpilledReg() : VGPR(AMDGPU::NoRegister), Lane(-1) { } bool hasLane() { return Lane != -1;} + bool hasReg() { return VGPR != AMDGPU::NoRegister;} }; // SIMachineFunctionInfo definition @@ -129,6 +143,7 @@ public: unsigned addDispatchPtr(const SIRegisterInfo &TRI); unsigned addQueuePtr(const SIRegisterInfo &TRI); unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + unsigned addFlatScratchInit(const SIRegisterInfo &TRI); // Add system SGPRs. 
unsigned addWorkGroupIDX() { @@ -161,6 +176,10 @@ public: return PrivateSegmentWaveByteOffsetSystemSGPR; } + void setPrivateSegmentWaveByteOffset(unsigned Reg) { + PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + } + bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } @@ -261,6 +280,10 @@ public: ScratchWaveOffsetReg = Reg; } + unsigned getQueuePtrUserSGPR() const { + return QueuePtrUserSGPR; + } + bool hasSpilledSGPRs() const { return HasSpilledSGPRs; } @@ -277,6 +300,38 @@ public: HasSpilledVGPRs = Spill; } + bool hasNonSpillStackObjects() const { + return HasNonSpillStackObjects; + } + + void setHasNonSpillStackObjects(bool StackObject = true) { + HasNonSpillStackObjects = StackObject; + } + + bool hasFlatInstructions() const { + return HasFlatInstructions; + } + + void setHasFlatInstructions(bool UseFlat = true) { + HasFlatInstructions = UseFlat; + } + + unsigned getNumSpilledSGPRs() const { + return NumSpilledSGPRs; + } + + unsigned getNumSpilledVGPRs() const { + return NumSpilledVGPRs; + } + + void addToSpilledSGPRs(unsigned num) { + NumSpilledSGPRs += num; + } + + void addToSpilledVGPRs(unsigned num) { + NumSpilledVGPRs += num; + } + unsigned getPSInputAddr() const { return PSInputAddr; } @@ -297,10 +352,70 @@ public: ReturnsVoid = Value; } + /// \returns Number of reserved VGPRs for debugger usage. + unsigned getDebuggerReservedVGPRCount() const { + return DebuggerReservedVGPRCount; + } + + /// \returns Stack object index for \p Dim's work group ID. + int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkGroupIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx. + void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns Stack object index for \p Dim's work item ID. + int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkItemIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx. + void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns SGPR used for \p Dim's work group ID. + unsigned getWorkGroupIDSGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkGroupIDX()); + return WorkGroupIDXSystemSGPR; + case 1: + assert(hasWorkGroupIDY()); + return WorkGroupIDYSystemSGPR; + case 2: + assert(hasWorkGroupIDZ()); + return WorkGroupIDZSystemSGPR; + } + llvm_unreachable("unexpected dimension"); + } + + /// \returns VGPR used for \p Dim' work item ID. 
+ unsigned getWorkItemIDVGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkItemIDX()); + return AMDGPU::VGPR0; + case 1: + assert(hasWorkItemIDY()); + return AMDGPU::VGPR1; + case 2: + assert(hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected dimension"); + } + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; } // End namespace llvm - #endif diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index 1cfa98430020..7125b411c603 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -12,8 +12,8 @@ // //===----------------------------------------------------------------------===// +#include "AMDGPU.h" #include "SIMachineScheduler.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -295,7 +295,7 @@ static bool isDefBetween(unsigned Reg, const MachineInstr* MI = &*UI; if (MI->isDebugValue()) continue; - SlotIndex InstSlot = LIS->getInstructionIndex(MI).getRegSlot(); + SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); if (InstSlot >= First && InstSlot <= Last) return true; } @@ -327,9 +327,9 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); // Do not Track Physical Registers, because it messes up. - for (unsigned Reg : RPTracker.getPressure().LiveInRegs) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) - LiveInRegs.insert(Reg); + for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { + if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) + LiveInRegs.insert(RegMaskPair.RegUnit); } LiveOutRegs.clear(); // There is several possibilities to distinguish: @@ -354,11 +354,12 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 // The use of findDefBetween removes the case 4. 
- for (unsigned Reg : RPTracker.getPressure().LiveOutRegs) { + for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { + unsigned Reg = RegMaskPair.RegUnit; if (TargetRegisterInfo::isVirtualRegister(Reg) && - isDefBetween(Reg, LIS->getInstructionIndex(BeginBlock).getRegSlot(), - LIS->getInstructionIndex(EndBlock).getRegSlot(), - MRI, LIS)) { + isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), + LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, + LIS)) { LiveOutRegs.insert(Reg); } } @@ -463,6 +464,9 @@ void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { for (SDep& Succ : SU->Succs) { SUnit *SuccSU = Succ.getSUnit(); + if (SuccSU->NodeNum >= DAG->SUnits.size()) + continue; + if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) continue; @@ -521,12 +525,9 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { } Preds.push_back(Pred); -#ifndef NDEBUG - for (SIScheduleBlock* S : Succs) { - if (PredID == S->getID()) - assert(!"Loop in the Block Graph!\n"); - } -#endif + assert(none_of(Succs, + [=](SIScheduleBlock *S) { return PredID == S->getID(); }) && + "Loop in the Block Graph!"); } void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { @@ -540,12 +541,9 @@ void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) { if (Succ->isHighLatencyBlock()) ++NumHighLatencySuccessors; Succs.push_back(Succ); -#ifndef NDEBUG - for (SIScheduleBlock* P : Preds) { - if (SuccID == P->getID()) - assert("Loop in the Block Graph!\n"); - } -#endif + assert(none_of(Preds, + [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && + "Loop in the Block Graph!"); } #ifndef NDEBUG @@ -712,8 +710,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { // Traverse TopDown, and give different colors to SUs depending // on which combination of High Latencies they depend on. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->TopDownIndex2SU[i]]; + for (unsigned SUNum : DAG->TopDownIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; // Already given. @@ -754,8 +752,8 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { // Same as before, but BottomUp. - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; // Already given. 
@@ -826,8 +824,8 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { unsigned DAGSize = DAG->SUnits.size(); std::vector<int> PendingColoring = CurrentColoring; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; std::set<unsigned> SUColorsPending; @@ -893,8 +891,8 @@ void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -919,8 +917,8 @@ void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -940,8 +938,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { unsigned DAGSize = DAG->SUnits.size(); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; std::set<unsigned> SUColors; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -962,8 +960,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { unsigned DAGSize = DAG->SUnits.size(); std::map<unsigned, unsigned> ColorCount; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color); if (Pos != ColorCount.end()) { @@ -973,8 +971,8 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { } } - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; unsigned color = CurrentColoring[SU->NodeNum]; std::set<unsigned> SUColors; @@ -1006,8 +1004,8 @@ void SIScheduleBlockCreator::regroupNoUserInstructions() { unsigned DAGSize = DAG->SUnits.size(); int GroupID = NextNonReservedID++; - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[DAG->BottomUpIndex2SU[i]]; + for (unsigned SUNum : DAG->BottomUpIndex2SU) { + SUnit *SU = &DAG->SUnits[SUNum]; bool hasSuccessor = false; if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) @@ -1223,7 +1221,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { // is the most cpu intensive operation of the scheduler. // It would gain a lot if there was a way to recompute the // LiveIntervals for the entire scheduling region. 
- DAG->getLIS()->handleMove(MI, /*UpdateFlags=*/true); + DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true); PosNew.push_back(CurrentTopFastSched); } } @@ -1249,7 +1247,7 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { DAG->getBB()->splice(POld, DAG->getBB(), PNew); // Update LiveIntervals. - DAG->getLIS()->handleMove(POld, /*UpdateFlags=*/true); + DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true); } } @@ -1675,70 +1673,10 @@ ScheduleDAGInstrs *llvm::createSIMachineScheduler(MachineSchedContext *C) { // Does a topological sort over the SUs. // Both TopDown and BottomUp void SIScheduleDAGMI::topologicalSort() { - std::vector<int> TopDownSU2Index; - unsigned DAGSize = SUnits.size(); - std::vector<SUnit*> WorkList; - - DEBUG(dbgs() << "Topological Sort\n"); - WorkList.reserve(DAGSize); - - TopDownIndex2SU.resize(DAGSize); - TopDownSU2Index.resize(DAGSize); - BottomUpIndex2SU.resize(DAGSize); - - WorkList.push_back(&getExitSU()); - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - int NodeNum = SU->NodeNum; - unsigned Degree = SU->Succs.size(); - TopDownSU2Index[NodeNum] = Degree; - if (Degree == 0) { - assert(SU->Succs.empty() && "SUnit should have no successors"); - WorkList.push_back(SU); - } - } - - int Id = DAGSize; - while (!WorkList.empty()) { - SUnit *SU = WorkList.back(); - WorkList.pop_back(); - if (SU->NodeNum < DAGSize) { - TopDownSU2Index[SU->NodeNum] = --Id; - TopDownIndex2SU[Id] = SU->NodeNum; - } - for (SDep& Pred : SU->Preds) { - SUnit *SU = Pred.getSUnit(); - if (SU->NodeNum < DAGSize && !--TopDownSU2Index[SU->NodeNum]) - WorkList.push_back(SU); - } - } - - BottomUpIndex2SU = std::vector<int>(TopDownIndex2SU.rbegin(), - TopDownIndex2SU.rend()); + Topo.InitDAGTopologicalSorting(); -#ifndef NDEBUG - // Check correctness of the ordering - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SDep& Pred : SU->Preds) { - if (Pred.getSUnit()->NodeNum >= DAGSize) - continue; - assert(TopDownSU2Index[SU->NodeNum] > - TopDownSU2Index[Pred.getSUnit()->NodeNum] && - "Wrong Top Down topological sorting"); - } - } - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SDep& Succ : SU->Succs) { - if (Succ.getSUnit()->NodeNum >= DAGSize) - continue; - assert(TopDownSU2Index[SU->NodeNum] < - TopDownSU2Index[Succ.getSUnit()->NodeNum] && - "Wrong Bottom Up topological sorting"); - } - } -#endif + TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); + BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); } // Move low latencies further from their user without @@ -1759,7 +1697,7 @@ void SIScheduleDAGMI::moveLowLatencies() { for (SDep& PredDep : SU->Preds) { SUnit *Pred = PredDep.getSUnit(); - if (SITII->isLowLatencyInstruction(Pred->getInstr())) { + if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { IsLowLatencyUser = true; } if (Pred->NodeNum >= DAGSize) @@ -1769,7 +1707,7 @@ void SIScheduleDAGMI::moveLowLatencies() { MinPos = PredPos + 1; } - if (SITII->isLowLatencyInstruction(SU->getInstr())) { + if (SITII->isLowLatencyInstruction(*SU->getInstr())) { unsigned BestPos = LastLowLatencyUser + 1; if ((int)BestPos <= LastLowLatencyPos) BestPos = LastLowLatencyPos + 1; @@ -1794,7 +1732,7 @@ void SIScheduleDAGMI::moveLowLatencies() { bool CopyForLowLat = false; for (SDep& SuccDep : SU->Succs) { SUnit *Succ = SuccDep.getSUnit(); - if (SITII->isLowLatencyInstruction(Succ->getInstr())) { + if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { CopyForLowLat = true; } } @@ 
-1855,7 +1793,6 @@ void SIScheduleDAGMI::schedule() SU.dumpAll(this) ); - Topo.InitDAGTopologicalSorting(); topologicalSort(); findRootsAndBiasEdges(TopRoots, BotRoots); // We reuse several ScheduleDAGMI and ScheduleDAGMILive @@ -1878,20 +1815,21 @@ void SIScheduleDAGMI::schedule() for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { SUnit *SU = &SUnits[i]; - unsigned BaseLatReg, OffLatReg; - if (SITII->isLowLatencyInstruction(SU->getInstr())) { + unsigned BaseLatReg; + int64_t OffLatReg; + if (SITII->isLowLatencyInstruction(*SU->getInstr())) { IsLowLatencySU[i] = 1; - if (SITII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseLatReg, - OffLatReg, TRI)) + if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, + TRI)) LowLatencyOffset[i] = OffLatReg; - } else if (SITII->isHighLatencyInstruction(SU->getInstr())) + } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) IsHighLatencySU[i] = 1; } SIScheduler Scheduler(this); Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); -#if 0 // To enable when handleMove fix lands + // if VGPR usage is extremely high, try other good performing variants // which could lead to lower VGPR usage if (Best.MaxVGPRUsage > 180) { @@ -1930,7 +1868,7 @@ void SIScheduleDAGMI::schedule() Best = Temp; } } -#endif + ScheduledSUnits = Best.SUs; ScheduledSUnitsInv.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h index b270136811c6..117aed497cc2 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/lib/Target/AMDGPU/SIMachineScheduler.h @@ -418,7 +418,7 @@ public: SISchedulerBlockSchedulerVariant ScheduleVariant); }; -class SIScheduleDAGMI : public ScheduleDAGMILive { +class SIScheduleDAGMI final : public ScheduleDAGMILive { const SIInstrInfo *SITII; const SIRegisterInfo *SITRI; @@ -441,7 +441,7 @@ public: // To init Block's RPTracker. 
void initRPTracker(RegPressureTracker &RPTracker) { - RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin); + RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false); } MachineBasicBlock *getBB() { return BB; } @@ -460,8 +460,10 @@ public: unsigned &VgprUsage, unsigned &SgprUsage); std::set<unsigned> getInRegs() { - std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(), - RPTracker.getPressure().LiveInRegs.end()); + std::set<unsigned> InRegs; + for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { + InRegs.insert(RegMaskPair.RegUnit); + } return InRegs; }; diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 025ed2b5b76b..0dd88ee45c58 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -15,6 +15,7 @@ #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/RegisterScavenging.h" @@ -23,7 +24,75 @@ using namespace llvm; -SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { +static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned SIMDPerCU = 4; + + unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize(); + return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) / + MaxInvocationsPerWave; +} + +static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); + + unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment; + unsigned ReservedSGPRCount; + + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + TotalSGPRCountPerSIMD = 800; + AddressableSGPRCount = 102; + SGPRUsageAlignment = 16; + ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK + } else { + TotalSGPRCountPerSIMD = 512; + AddressableSGPRCount = 104; + SGPRUsageAlignment = 8; + ReservedSGPRCount = 2; // VCC + } + + unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD); + MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment); + + if (ST.hasSGPRInitBug()) + MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; + + return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount); +} + +static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) { + unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF); + unsigned TotalVGPRCountPerSIMD = 256; + unsigned VGPRUsageAlignment = 4; + + return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD, + VGPRUsageAlignment); +} + +static bool hasPressureSet(const int *PSets, unsigned PSetID) { + for (unsigned i = 0; PSets[i] != -1; ++i) { + if (PSets[i] == (int)PSetID) + return true; + } + return false; +} + +void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg, + BitVector &PressureSets) const { + for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) { + const int *PSets = getRegUnitPressureSets(*U); + if (hasPressureSet(PSets, PSetID)) { + PressureSets.set(PSetID); + break; + } + } +} + +SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(), + SGPRPressureSets(getNumRegPressureSets()), + VGPRPressureSets(getNumRegPressureSets()) { unsigned NumRegPressureSets = getNumRegPressureSets(); SGPR32SetID = 
NumRegPressureSets; @@ -33,6 +102,9 @@ SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo() { SGPR32SetID = i; else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0) VGPR32SetID = i; + + classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets); + classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets); } assert(SGPR32SetID < NumRegPressureSets && VGPR32SetID < NumRegPressureSets); @@ -47,38 +119,27 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - if (ST.hasSGPRInitBug()) { - // Leave space for flat_scr, xnack_mask, vcc, and alignment - unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4; - unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); - return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); - } - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and - // 100/101 for vcc. This is the next sgpr128 down. - return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; - } - - return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; + unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); } unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( const MachineFunction &MF) const { - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - if (ST.hasSGPRInitBug()) { - unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1; - return AMDGPU::SGPR_32RegClass.getRegister(Idx); - } - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Next register before reservations for flat_scr, xnack_mask, vcc, - // and scratch resource. - return AMDGPU::SGPR91; + unsigned RegCount = getMaxWorkGroupSGPRCount(MF); + unsigned Reg; + + // Try to place it in a hole after PrivateSegmentbufferReg. + if (RegCount & 3) { + // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to + // alignment constraints, so we have a hole where can put the wave offset. + Reg = RegCount - 1; + } else { + // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the + // wave offset before it. + Reg = RegCount - 5; } - - return AMDGPU::SGPR95; + return AMDGPU::SGPR_32RegClass.getRegister(Reg); } BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -90,35 +151,30 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve the last 2 registers so we will always have at least 2 more that - // will physically contain VCC. - reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); - - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); - - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation - // for VCC/XNACK_MASK/FLAT_SCR. - // - // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose - // SGPRs when the XNACK feature is not used. This is currently not done - // because the code that counts SGPRs cannot account for such holes. 
- reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97); - reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); - reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + // Reserve Trap Handler registers - support is not implemented in Codegen. + reserveRegisterTuples(Reserved, AMDGPU::TBA); + reserveRegisterTuples(Reserved, AMDGPU::TMA); + reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1); + reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3); + reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5); + reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7); + reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9); + reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11); + + unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF); + unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF); + + unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); + unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs(); + for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) { + unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } - // Tonga and Iceland can only allocate a fixed number of SGPRs due - // to a hw bug. - if (ST.hasSGPRInitBug()) { - unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); - // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs). - unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6; - for (unsigned i = Limit; i < NumSGPRs; ++i) { - unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i); - reserveRegisterTuples(Reserved, Reg); - } + for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); } const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); @@ -138,48 +194,182 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs" + // attribute was specified. + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + if (ST.debuggerReserveRegs()) { + unsigned ReservedVGPRFirst = + MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount(); + for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &STI = MF.getSubtarget<SISubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. - unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), - STI.getMaxWavesPerCU()); + unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); unsigned VSLimit = SGPRLimit + VGPRLimit; - for (regclass_iterator I = regclass_begin(), E = regclass_end(); - I != E; ++I) { - const TargetRegisterClass *RC = *I; + if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. 
+ return VSLimit; + } - unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); - unsigned Limit; + if (SGPRPressureSets.test(Idx)) + return SGPRLimit; - if (isPseudoRegClass(RC)) { - // FIXME: This is a hack. We should never be considering the pressure of - // these since no virtual register should ever have this class. - Limit = VSLimit; - } else if (isSGPRClass(RC)) { - Limit = SGPRLimit / NumSubRegs; - } else { - Limit = VGPRLimit / NumSubRegs; - } + return VGPRLimit; +} + +bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { + return Fn.getFrameInfo()->hasStackObjects(); +} + +bool +SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects(); +} + +bool SIRegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &) const { + // There are no special dedicated stack or frame pointers. + return true; +} + +bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { + // This helps catch bugs as verifier errors. + return true; +} + +int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, + int Idx) const { + if (!SIInstrInfo::isMUBUF(*MI)) + return 0; + + assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::vaddr) && + "Should never see frame index on non-address operand"); + + int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), + AMDGPU::OpName::offset); + return MI->getOperand(OffIdx).getImm(); +} + +bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + return MI->mayLoadOrStore(); +} - const int *Sets = getRegClassPressureSets(RC); - assert(Sets); - for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) - return Limit; +void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = Subtarget.getInstrInfo(); + + if (Offset == 0) { + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg) + .addFrameIndex(FrameIdx); + return; + } + + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(Offset); + BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead) + .addReg(OffsetReg, RegState::Kill) + .addFrameIndex(FrameIdx); +} + +void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = Subtarget.getInstrInfo(); + +#ifndef NDEBUG + // FIXME: Is it possible to be storing a frame index to itself? 
+ bool SeenFI = false; + for (const MachineOperand &MO: MI.operands()) { + if (MO.isFI()) { + if (SeenFI) + llvm_unreachable("should not see multiple frame indices"); + + SeenFI = true; } } - return 256; +#endif + + MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + assert(FIOp && FIOp->isFI() && "frame index must be address operand"); + + assert(TII->isMUBUF(MI)); + + MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset); + int64_t NewOffset = OffsetOp->getImm() + Offset; + if (isUInt<12>(NewOffset)) { + // If we have a legal offset, fold it directly into the instruction. + FIOp->ChangeToRegister(BaseReg, false); + OffsetOp->setImm(NewOffset); + return; + } + + // The offset is not legal, so we must insert an add of the offset. + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + DebugLoc DL = MI.getDebugLoc(); + + assert(Offset != 0 && "Non-zero offset expected"); + + unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + + // In the case the instruction already had an immediate offset, here only + // the requested new offset is added because we are leaving the original + // immediate in place. + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg) + .addImm(Offset); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg) + .addReg(UnusedCarry, RegState::Define | RegState::Dead) + .addReg(OffsetReg, RegState::Kill) + .addReg(BaseReg); + + FIOp->ChangeToRegister(NewReg, false); } -bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const { - return Fn.getFrameInfo()->hasStackObjects(); +bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + unsigned BaseReg, + int64_t Offset) const { + return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset); +} + +const TargetRegisterClass *SIRegisterInfo::getPointerRegClass( + const MachineFunction &MF, unsigned Kind) const { + // This is inaccurate. It depends on the instruction and address space. The + // only place where we should hit this is for dealing with frame indexes / + // private accesses, so this is correct in that case. 
+ return &AMDGPU::VGPR_32RegClass; } static unsigned getNumSubRegsForSpillOp(unsigned Op) { @@ -219,32 +409,48 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp, - unsigned Value, + const MachineOperand *SrcDst, unsigned ScratchRsrcReg, unsigned ScratchOffset, int64_t Offset, RegScavenger *RS) const { + unsigned Value = SrcDst->getReg(); + bool IsKill = SrcDst->isKill(); MachineBasicBlock *MBB = MI->getParent(); - const MachineFunction *MF = MI->getParent()->getParent(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); - LLVMContext &Ctx = MF->getFunction()->getContext(); + MachineFunction *MF = MI->getParent()->getParent(); + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); - bool IsLoad = TII->get(LoadStoreOp).mayLoad(); + bool IsStore = MI->mayStore(); bool RanOutOfSGPRs = false; bool Scavenged = false; unsigned SOffset = ScratchOffset; + unsigned OriginalImmOffset = Offset; unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); unsigned Size = NumSubRegs * 4; if (!isUInt<12>(Offset + Size)) { - SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0); + SOffset = AMDGPU::NoRegister; + + // We don't have access to the register scavenger if this function is called + // during PEI::scavengeFrameVirtualRegs(). + if (RS) + SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass); + if (SOffset == AMDGPU::NoRegister) { + // There are no free SGPRs, and since we are in the process of spilling + // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true + // on SI/CI and on VI it is true until we implement spilling using scalar + // stores), we have no way to free up an SGPR. Our solution here is to + // add the offset directly to the ScratchOffset register, and then + // subtract the offset after the spill to return ScratchOffset to it's + // original value. RanOutOfSGPRs = true; - SOffset = AMDGPU::SGPR0; + SOffset = ScratchOffset; } else { Scavenged = true; } @@ -254,40 +460,48 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, Offset = 0; } - if (RanOutOfSGPRs) - Ctx.emitError("Ran out of SGPRs for spilling VGPRS"); - for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) { unsigned SubReg = NumSubRegs > 1 ? getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; unsigned SOffsetRegState = 0; - if (i + 1 == e && Scavenged) - SOffsetRegState |= RegState::Kill; + unsigned SrcDstRegState = getDefRegState(!IsStore); + if (i + 1 == e) { + SOffsetRegState |= getKillRegState(Scavenged); + // The last implicit use carries the "Kill" flag. + SrcDstRegState |= getKillRegState(IsKill); + } BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(SubReg, getDefRegState(!IsStore)) .addReg(ScratchRsrcReg) .addReg(SOffset, SOffsetRegState) .addImm(Offset) .addImm(0) // glc .addImm(0) // slc .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .addReg(Value, RegState::Implicit | SrcDstRegState) .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } + if (RanOutOfSGPRs) { + // Subtract the offset we added to the ScratchOffset register. 
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset) + .addReg(ScratchOffset) + .addImm(OriginalImmOffset); + } } void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock *MBB = MI->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo()); + const SISubtarget &ST = MF->getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); DebugLoc DL = MI->getDebugLoc(); MachineOperand &FIOp = MI->getOperand(FIOperandNum); @@ -301,24 +515,65 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned SuperReg = MI->getOperand(0).getReg(); + bool IsKill = MI->getOperand(0).isKill(); + // SubReg carries the "Kill" flag when SubReg == SuperReg. + unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { - unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), + unsigned SubReg = getPhysRegSubReg(SuperReg, &AMDGPU::SGPR_32RegClass, i); + struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), - Spill.VGPR) - .addReg(SubReg) - .addImm(Spill.Lane); - - // FIXME: Since this spills to another register instead of an actual - // frame index, we should delete the frame index when all references to - // it are fixed. + if (Spill.hasReg()) { + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32), + Spill.VGPR) + .addReg(SubReg, getKillRegState(IsKill)) + .addImm(Spill.Lane); + + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. + } else { + // Spill SGPR to a frame index. + // FIXME we should use S_STORE_DWORD here for VI. + MachineInstrBuilder Mov + = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(SubReg, SubKillState); + + + // There could be undef components of a spilled super register. + // TODO: Can we detect this and skip the spill? + if (NumSubRegs > 1) { + // The last implicit use of the SuperReg carries the "Kill" flag. 
+ unsigned SuperKillState = 0; + if (i + 1 == e) + SuperKillState |= getKillRegState(IsKill); + Mov.addReg(SuperReg, RegState::Implicit | SuperKillState); + } + + unsigned Size = FrameInfo->getObjectSize(Index); + unsigned Align = FrameInfo->getObjectAlignment(Index); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE)) + .addReg(TmpReg, RegState::Kill) // src + .addFrameIndex(Index) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(i * 4) // offset + .addMemOperand(MMO); + } } MI->eraseFromParent(); + MFI->addToSpilledSGPRs(NumSubRegs); break; } @@ -329,6 +584,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: { unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); for (unsigned i = 0, e = NumSubRegs; i < e; ++i) { unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(), @@ -336,28 +592,37 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, struct SIMachineFunctionInfo::SpilledReg Spill = MFI->getSpilledReg(MF, Index, i); - BuildMI(*MBB, MI, DL, - TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), - SubReg) - .addReg(Spill.VGPR) - .addImm(Spill.Lane) - .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); - } - - // TODO: only do this when it is needed - switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states - // ("S_NOP 3") on SI - TII->insertWaitStates(MI, 4); - break; - case AMDGPUSubtarget::SEA_ISLANDS: - break; - default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states - // ("S_NOP 4") on VI and later. This also applies to VALUs which write - // VCC, but we're unlikely to see VMEM use VCC. - TII->insertWaitStates(MI, 5); + if (Spill.hasReg()) { + BuildMI(*MBB, MI, DL, + TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), + SubReg) + .addReg(Spill.VGPR) + .addImm(Spill.Lane) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } else { + // Restore SGPR from a stack slot. + // FIXME: We should use S_LOAD_DWORD here for VI. 
+ + unsigned Align = FrameInfo->getObjectAlignment(Index); + unsigned Size = FrameInfo->getObjectSize(Index); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, Index); + + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg) + .addFrameIndex(Index) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addImm(i * 4) // offset + .addMemOperand(MMO); + BuildMI(*MBB, MI, DL, + TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg) + .addReg(TmpReg, RegState::Kill) + .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine); + } } MI->eraseFromParent(); @@ -372,11 +637,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V64_SAVE: case AMDGPU::SI_SPILL_V32_SAVE: buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::src), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); + FrameInfo->getObjectOffset(Index) + + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); MI->eraseFromParent(); + MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); break; case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_V64_RESTORE: @@ -385,10 +652,11 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_V512_RESTORE: { buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, - TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(), + TII->getNamedOperand(*MI, AMDGPU::OpName::dst), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(), TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(), - FrameInfo->getObjectOffset(Index), RS); + FrameInfo->getObjectOffset(Index) + + TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS); MI->eraseFromParent(); break; } @@ -396,8 +664,8 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, default: { int64_t Offset = FrameInfo->getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); - if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) { - unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, SPAdj); + if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) .addImm(Offset); @@ -407,10 +675,6 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { - return getEncodingValue(Reg) & 0xff; -} - // FIXME: This is very slow. It might be worth creating a map from physreg to // register class. 
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { @@ -427,7 +691,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::VReg_256RegClass, &AMDGPU::SReg_256RegClass, &AMDGPU::VReg_512RegClass, - &AMDGPU::SReg_512RegClass + &AMDGPU::SReg_512RegClass, + &AMDGPU::SCC_CLASSRegClass, }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -442,6 +707,8 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { // TargetRegisterClass to mark which classes are VGPRs to make this trivial. bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { switch (RC->getSize()) { + case 0: return false; + case 1: return false; case 4: return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; case 8: @@ -479,6 +746,24 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( } } +const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass( + const TargetRegisterClass *VRC) const { + switch (VRC->getSize()) { + case 4: + return &AMDGPU::SGPR_32RegClass; + case 8: + return &AMDGPU::SReg_64RegClass; + case 16: + return &AMDGPU::SReg_128RegClass; + case 32: + return &AMDGPU::SReg_256RegClass; + case 64: + return &AMDGPU::SReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } +} + const TargetRegisterClass *SIRegisterInfo::getSubRegClass( const TargetRegisterClass *RC, unsigned SubIdx) const { if (SubIdx == AMDGPU::NoSubRegister) @@ -552,7 +837,21 @@ unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, switch(Channel) { case 0: return AMDGPU::VCC_LO; case 1: return AMDGPU::VCC_HI; - default: llvm_unreachable("Invalid SubIdx for VCC"); + default: llvm_unreachable("Invalid SubIdx for VCC"); break; + } + + case AMDGPU::TBA: + switch(Channel) { + case 0: return AMDGPU::TBA_LO; + case 1: return AMDGPU::TBA_HI; + default: llvm_unreachable("Invalid SubIdx for TBA"); break; + } + + case AMDGPU::TMA: + switch(Channel) { + case 0: return AMDGPU::TMA_LO; + case 1: return AMDGPU::TMA_HI; + default: llvm_unreachable("Invalid SubIdx for TMA"); break; } case AMDGPU::FLAT_SCR: @@ -610,7 +909,7 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); (void)ST; switch (Value) { case SIRegisterInfo::WORKGROUP_ID_X: @@ -631,11 +930,17 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, case SIRegisterInfo::KERNARG_SEGMENT_PTR: assert(MFI->hasKernargSegmentPtr()); return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_ID: + llvm_unreachable("unimplemented"); + case SIRegisterInfo::FLAT_SCRATCH_INIT: + assert(MFI->hasFlatScratchInit()); + return MFI->FlatScratchInitUserSGPR; case SIRegisterInfo::DISPATCH_PTR: assert(MFI->hasDispatchPtr()); return MFI->DispatchPtrUserSGPR; case SIRegisterInfo::QUEUE_PTR: - llvm_unreachable("not implemented"); + assert(MFI->hasQueuePtr()); + return MFI->QueuePtrUserSGPR; case SIRegisterInfo::WORKITEM_ID_X: assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; @@ -675,9 +980,9 @@ unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const { } } -unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, +unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const { - if (gen >= 
AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { switch (WaveCount) { case 10: return 80; case 9: return 80; @@ -696,3 +1001,14 @@ unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, } } } + +bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI, + unsigned Reg) const { + const TargetRegisterClass *RC; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + + return hasVGPRs(RC); +} diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 9410e2049cba..6e97b1b910a9 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -12,23 +12,27 @@ // //===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_R600_SIREGISTERINFO_H -#define LLVM_LIB_TARGET_R600_SIREGISTERINFO_H +#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H +#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" -#include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" namespace llvm { -struct SIRegisterInfo : public AMDGPURegisterInfo { +class SISubtarget; +class MachineRegisterInfo; + +struct SIRegisterInfo final : public AMDGPURegisterInfo { private: unsigned SGPR32SetID; unsigned VGPR32SetID; + BitVector SGPRPressureSets; + BitVector VGPRPressureSets; void reserveRegisterTuples(BitVector &, unsigned Reg) const; + void classifyPressureSet(unsigned PSetID, unsigned Reg, + BitVector &PressureSets) const; public: SIRegisterInfo(); @@ -47,13 +51,39 @@ public: unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override; + bool requiresRegisterScavenging(const MachineFunction &Fn) const override; + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override; + bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override; + + int64_t getFrameIndexInstrOffset(const MachineInstr *MI, + int Idx) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + + void materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, int FrameIdx, + int64_t Offset) const override; + + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + + bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, + int64_t Offset) const override; + + const TargetRegisterClass *getPointerRegClass( + const MachineFunction &MF, unsigned Kind = 0) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; - unsigned getHWRegIndex(unsigned Reg) const override; + unsigned getHWRegIndex(unsigned Reg) const { + return getEncodingValue(Reg) & 0xff; + } /// \brief Return the 'base' register class for this register. /// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc. @@ -70,9 +100,12 @@ public: } bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + const TargetRegisterClass *RC; if (TargetRegisterInfo::isVirtualRegister(Reg)) - return isSGPRClass(MRI.getRegClass(Reg)); - return getPhysRegClass(Reg); + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + return isSGPRClass(RC); } /// \returns true if this class contains VGPR registers. 
@@ -89,6 +122,10 @@ public: const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; + /// \returns A SGPR reg class with the same width as \p SRC + const TargetRegisterClass *getEquivalentSGPRClass( + const TargetRegisterClass *VRC) const; + /// \returns The register class that is used for a sub-register of \p RC for /// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will /// be returned. @@ -117,10 +154,12 @@ public: enum PreloadedValue { // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, + PRIVATE_SEGMENT_BUFFER = 0, DISPATCH_PTR = 1, QUEUE_PTR = 2, KERNARG_SEGMENT_PTR = 3, + DISPATCH_ID = 4, + FLAT_SCRATCH_INIT = 5, WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, @@ -143,8 +182,7 @@ public: /// \brief Give the maximum number of SGPRs that can be used by \p WaveCount /// concurrent waves. - unsigned getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen, - unsigned WaveCount) const; + unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const; @@ -152,11 +190,14 @@ public: unsigned getSGPR32PressureSet() const { return SGPR32SetID; }; unsigned getVGPR32PressureSet() const { return VGPR32SetID; }; + bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const; + private: void buildScratchLoadStore(MachineBasicBlock::iterator MI, - unsigned LoadStoreOp, unsigned Value, + unsigned LoadStoreOp, const MachineOperand *SrcDst, unsigned ScratchRsrcReg, unsigned ScratchOffset, - int64_t Offset, RegScavenger *RS) const; + int64_t Offset, + RegScavenger *RS) const; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index bfaf93709d8c..c427874d467a 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -44,6 +44,40 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; +// Trap handler registers +def TBA_LO : SIReg<"tba_lo", 108>; +def TBA_HI : SIReg<"tba_hi", 109>; + +def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>, + DwarfRegAlias<TBA_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 108; +} + +def TMA_LO : SIReg<"tma_lo", 110>; +def TMA_HI : SIReg<"tma_hi", 111>; + +def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>, + DwarfRegAlias<TMA_LO> { + let Namespace = "AMDGPU"; + let SubRegIndices = [sub0, sub1]; + let HWEncoding = 110; +} + +def TTMP0 : SIReg <"ttmp0", 112>; +def TTMP1 : SIReg <"ttmp1", 113>; +def TTMP2 : SIReg <"ttmp2", 114>; +def TTMP3 : SIReg <"ttmp3", 115>; +def TTMP4 : SIReg <"ttmp4", 116>; +def TTMP5 : SIReg <"ttmp5", 117>; +def TTMP6 : SIReg <"ttmp6", 118>; +def TTMP7 : SIReg <"ttmp7", 119>; +def TTMP8 : SIReg <"ttmp8", 120>; +def TTMP9 : SIReg <"ttmp9", 121>; +def TTMP10 : SIReg <"ttmp10", 122>; +def TTMP11 : SIReg <"ttmp11", 123>; + multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { def _ci : SIReg<n, ci_e>; def _vi : SIReg<n, vi_e>; @@ -81,11 +115,18 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> { + let CopyCost = -1; + let isAllocatable = 0; +} + // TODO: Do we need to set DwarfRegAlias on register tuples? 
// SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 103))>; + (add (sequence "SGPR%u", 0, 103))> { + let AllocationPriority = 1; +} // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], @@ -93,7 +134,7 @@ def SGPR_64Regs : RegisterTuples<[sub0, sub1], (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers -def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], +def SGPR_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), @@ -130,9 +171,29 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, (add (decimate (shl SGPR_32, 14), 4)), (add (decimate (shl SGPR_32, 15), 4))]>; +// Trap handler TMP 32-bit registers +def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add (sequence "TTMP%u", 0, 11))> { + let isAllocatable = 0; +} + +// Trap handler TMP 64-bit registers +def TTMP_64Regs : RegisterTuples<[sub0, sub1], + [(add (decimate TTMP_32, 2)), + (add (decimate (shl TTMP_32, 1), 2))]>; + +// Trap handler TMP 128-bit registers +def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3], + [(add (decimate TTMP_32, 4)), + (add (decimate (shl TTMP_32, 1), 4)), + (add (decimate (shl TTMP_32, 2), 4)), + (add (decimate (shl TTMP_32, 3), 4))]>; + // VGPR 32-bit registers def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "VGPR%u", 0, 255))>; + (add (sequence "VGPR%u", 0, 255))> { + let AllocationPriority = 1; +} // VGPR 64-bit registers def VGPR_64 : RegisterTuples<[sub0, sub1], @@ -192,36 +253,67 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } +// Subset of SReg_32 without M0 for SMRD instructions and alike. +// See comments in SIInstructions.td for more info. 
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32], 32, + (add SGPR_32, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI, + TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> { + let AllocationPriority = 1; +} + // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) ->; + (add SReg_32_XM0, M0)> { + let AllocationPriority = 1; +} + +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> { + let AllocationPriority = 2; +} -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; +def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> { + let isAllocatable = 0; +} def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, - (add SGPR_64, VCC, EXEC, FLAT_SCR) ->; + (add SGPR_64, VCC, EXEC, FLAT_SCR, TTMP_64, TBA, TMA)> { + let AllocationPriority = 2; +} -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { - // Requires 2 s_mov_b64 to copy - let CopyCost = 2; +// Requires 2 s_mov_b64 to copy +let CopyCost = 2 in { + +def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128Regs)> { + let AllocationPriority = 4; +} + +def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128Regs)> { + let isAllocatable = 0; +} + +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { + let AllocationPriority = 4; } -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { +} // End CopyCost = 2 + +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; + let AllocationPriority = 5; } def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { // Requires 8 s_mov_b64 to copy let CopyCost = 8; + let AllocationPriority = 6; } // Register class for all vector registers (VGPRs + Interploation Registers) def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { // Requires 2 v_mov_b32 to copy let CopyCost = 2; + let AllocationPriority = 2; } def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { @@ -229,19 +321,23 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> { // Requires 3 v_mov_b32 to copy let CopyCost = 3; + let AllocationPriority = 3; } def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { // Requires 4 v_mov_b32 to copy let CopyCost = 4; + let AllocationPriority = 4; } -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { +def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add VGPR_256)> { let CopyCost = 8; + let AllocationPriority = 5; } def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { let CopyCost = 16; + let AllocationPriority = 6; } def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { diff --git a/lib/Target/AMDGPU/SISchedule.td b/lib/Target/AMDGPU/SISchedule.td index cd77e519abb2..ed19217226b8 100644 --- a/lib/Target/AMDGPU/SISchedule.td +++ b/lib/Target/AMDGPU/SISchedule.td @@ -11,6 +11,12 @@ // //===----------------------------------------------------------------------===// +def : PredicateProlog<[{ + const SIInstrInfo *TII = + static_cast<const SIInstrInfo*>(SchedModel->getInstrInfo()); + (void)TII; +}]>; + def WriteBranch : SchedWrite; def WriteExport : SchedWrite; 
def WriteLDS : SchedWrite; @@ -39,20 +45,33 @@ def Write64Bit : SchedWrite; // instructions and have VALU rates, but write to the SALU (i.e. VOPC // instructions) -def SIFullSpeedModel : SchedMachineModel; -def SIQuarterSpeedModel : SchedMachineModel; +class SISchedMachineModel : SchedMachineModel { + let CompleteModel = 0; + let IssueWidth = 1; + let PostRAScheduler = 1; +} -// BufferSize = 0 means the processors are in-order. -let BufferSize = 0 in { +def SIFullSpeedModel : SISchedMachineModel; +def SIQuarterSpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? -def HWBranch : ProcResource<1>; -def HWExport : ProcResource<7>; // Taken from S_WAITCNT -def HWLGKM : ProcResource<31>; // Taken from S_WAITCNT -def HWSALU : ProcResource<1>; -def HWVMEM : ProcResource<15>; // Taken from S_WAITCNT -def HWVALU : ProcResource<1>; - +def HWBranch : ProcResource<1> { + let BufferSize = 1; +} +def HWExport : ProcResource<1> { + let BufferSize = 7; // Taken from S_WAITCNT +} +def HWLGKM : ProcResource<1> { + let BufferSize = 31; // Taken from S_WAITCNT +} +def HWSALU : ProcResource<1> { + let BufferSize = 1; +} +def HWVMEM : ProcResource<1> { + let BufferSize = 15; // Taken from S_WAITCNT +} +def HWVALU : ProcResource<1> { + let BufferSize = 1; } class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources, @@ -70,12 +89,12 @@ class HWVALUWriteRes<SchedWrite write, int latency> : // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { - def : HWWriteRes<WriteBranch, [HWBranch], 100>; // XXX: Guessed ??? - def : HWWriteRes<WriteExport, [HWExport], 100>; // XXX: Guessed ??? - def : HWWriteRes<WriteLDS, [HWLGKM], 32>; // 2 - 64 - def : HWWriteRes<WriteSALU, [HWSALU], 1>; - def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? - def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + def : HWWriteRes<WriteBranch, [HWBranch], 8>; + def : HWWriteRes<WriteExport, [HWExport], 4>; + def : HWWriteRes<WriteLDS, [HWLGKM], 5>; // Can be between 2 and 64 + def : HWWriteRes<WriteSALU, [HWSALU], 1>; + def : HWWriteRes<WriteSMEM, [HWLGKM], 5>; + def : HWWriteRes<WriteVMEM, [HWVMEM], 80>; def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? 
def : HWVALUWriteRes<Write32Bit, 1>; @@ -83,6 +102,12 @@ multiclass SICommonWriteRes { def : HWVALUWriteRes<WriteQuarterRate32, 4>; } +def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>; +def PredIsVGPR64Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) > 32}]>; +def WriteCopy : SchedWriteVariant<[ + SchedVar<PredIsVGPR32Copy, [Write32Bit]>, + SchedVar<PredIsVGPR64Copy, [Write64Bit]>, + SchedVar<NoSchedPred, [WriteSALU]>]>; let SchedModel = SIFullSpeedModel in { @@ -92,6 +117,8 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>; def : HWVALUWriteRes<WriteDouble, 4>; def : HWVALUWriteRes<WriteDoubleAdd, 2>; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIFullSpeedModel let SchedModel = SIQuarterSpeedModel in { @@ -102,4 +129,6 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>; def : HWVALUWriteRes<WriteDouble, 16>; def : HWVALUWriteRes<WriteDoubleAdd, 8>; +def : InstRW<[WriteCopy], (instrs COPY)>; + } // End SchedModel = SIQuarterSpeedModel diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 4f0913fe62f2..6cba55300a8c 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -31,10 +31,6 @@ STATISTIC(NumInstructionsShrunk, STATISTIC(NumLiteralConstantsFolded, "Number of literal constants folded into 32-bit instructions."); -namespace llvm { - void initializeSIShrinkInstructionsPass(PassRegistry&); -} - using namespace llvm; namespace { @@ -61,10 +57,8 @@ public: } // End anonymous namespace. -INITIALIZE_PASS_BEGIN(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) -INITIALIZE_PASS_END(SIShrinkInstructions, DEBUG_TYPE, - "SI Lower il Copies", false, false) +INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE, + "SI Shrink Instructions", false, false) char SIShrinkInstructions::ID = 0; @@ -125,10 +119,7 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) return false; - if (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp)) - return false; - - return true; + return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); } /// \brief This function checks \p MI for operands defined by a move immediate @@ -181,31 +172,37 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(&MI)) + if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI)) foldImmediates(MI, TII, MRI, false); } // Copy MachineOperand with all flags except setting it as implicit. 
-static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) { - assert(!Orig.isImplicit()); - return MachineOperand::CreateReg(Orig.getReg(), - Orig.isDef(), - true, - Orig.isKill(), - Orig.isDead(), - Orig.isUndef(), - Orig.isEarlyClobber(), - Orig.getSubReg(), - Orig.isDebug(), - Orig.isInternalRead()); +static void copyFlagsToImplicitVCC(MachineInstr &MI, + const MachineOperand &Orig) { + + for (MachineOperand &Use : MI.implicit_operands()) { + if (Use.getReg() == AMDGPU::VCC) { + Use.setIsUndef(Orig.isUndef()); + Use.setIsKill(Orig.isKill()); + return; + } + } +} + +static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { + return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4); } bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); + std::vector<unsigned> I1Defs; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -217,14 +214,94 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; + if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { + // If this has a literal constant source that is the same as the + // reversed bits of an inline immediate, replace with a bitreverse of + // that constant. This saves 4 bytes in the common case of materializing + // sign bits. + + // Test if we are after regalloc. We only want to do this after any + // optimizations happen because this will confuse them. + // XXX - not exactly a check for post-regalloc run. + MachineOperand &Src = MI.getOperand(1); + if (Src.isImm() && + TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) { + int64_t Imm = Src.getImm(); + if (isInt<32>(Imm) && !TII->isInlineConstant(Src, 4)) { + int32_t ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Imm)); + if (ReverseImm >= -16 && ReverseImm <= 64) { + MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32)); + Src.setImm(ReverseImm); + continue; + } + } + } + } + + // Combine adjacent s_nops to use the immediate operand encoding how long + // to wait. + // + // s_nop N + // s_nop M + // => + // s_nop (N + M) + if (MI.getOpcode() == AMDGPU::S_NOP && + Next != MBB.end() && + (*Next).getOpcode() == AMDGPU::S_NOP) { + + MachineInstr &NextMI = *Next; + // The instruction encodes the amount to wait with an offset of 1, + // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back + // after adding. + uint8_t Nop0 = MI.getOperand(0).getImm() + 1; + uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; + + // Make sure we don't overflow the bounds. + if (Nop0 + Nop1 <= 8) { + NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); + MI.eraseFromParent(); + } + + continue; + } + + // FIXME: We also need to consider movs of constant operands since + // immediate operands are not folded if they have more than one use, and + // the operand folding pass is unaware if the immediate will be free since + // it won't know if the src == dest constraint will end up being + // satisfied. 
+ if (MI.getOpcode() == AMDGPU::S_ADD_I32 || + MI.getOpcode() == AMDGPU::S_MUL_I32) { + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); + + // FIXME: This could work better if hints worked with subregisters. If + // we have a vector add of a constant, we usually don't get the correct + // allocation due to the subregister usage. + if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) && + Src0.isReg()) { + MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg()); + continue; + } + + if (Src0.isReg() && Src0.getReg() == Dest.getReg()) { + if (Src1.isImm() && isKImmOperand(TII, Src1)) { + unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? + AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; + + MI.setDesc(TII->get(Opc)); + MI.tieOperands(0, 1); + } + } + } + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. if (MI.getOpcode() == AMDGPU::S_MOV_B32) { const MachineOperand &Src = MI.getOperand(1); - if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - } + if (Src.isImm() && isKImmOperand(TII, Src)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); continue; } @@ -235,7 +312,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (!canShrink(MI, TII, TRI, MRI)) { // Try commuting the instruction and see if that enables us to shrink // it. - if (!MI.isCommutable() || !TII->commuteInstruction(&MI) || + if (!MI.isCommutable() || !TII->commuteInstruction(MI) || !canShrink(MI, TII, TRI, MRI)) continue; } @@ -287,9 +364,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. - int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst); if (Op32DstIdx != -1) { // dst Inst32.addOperand(MI.getOperand(0)); @@ -314,10 +391,9 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Inst32.addOperand(*Src2); } else { // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is - // replaced with an implicit read of vcc. - assert(Src2->getReg() == AMDGPU::VCC && - "Unexpected missing register operand"); - Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + // replaced with an implicit read of vcc. This was already added + // during the initial BuildMI, so find it to preserve the flags. 
+ copyFlagsToImplicitVCC(*Inst32, *Src2); } } diff --git a/lib/Target/AMDGPU/SITypeRewriter.cpp b/lib/Target/AMDGPU/SITypeRewriter.cpp index d36c5d29b127..facc0c7df1dc 100644 --- a/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -62,7 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F.getCallingConv())) return false; visit(F); diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp new file mode 100644 index 000000000000..c1a237ea5f51 --- /dev/null +++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -0,0 +1,509 @@ +//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass adds instructions to enable whole quad mode for pixel +/// shaders. +/// +/// Whole quad mode is required for derivative computations, but it interferes +/// with shader side effects (stores and atomics). This pass is run on the +/// scheduled machine IR but before register coalescing, so that machine SSA is +/// available for analysis. It ensures that WQM is enabled when necessary, but +/// disabled around stores and atomics. +/// +/// When necessary, this pass creates a function prolog +/// +/// S_MOV_B64 LiveMask, EXEC +/// S_WQM_B64 EXEC, EXEC +/// +/// to enter WQM at the top of the function and surrounds blocks of Exact +/// instructions by +/// +/// S_AND_SAVEEXEC_B64 Tmp, LiveMask +/// ... +/// S_MOV_B64 EXEC, Tmp +/// +/// In order to avoid excessive switching during sequences of Exact +/// instructions, the pass first analyzes which instructions must be run in WQM +/// (aka which instructions produce values that lead to derivative +/// computations). +/// +/// Basic blocks are always exited in WQM as long as some successor needs WQM. +/// +/// There is room for improvement given better control flow analysis: +/// +/// (1) at the top level (outside of control flow statements, and as long as +/// kill hasn't been used), one SGPR can be saved by recovering WQM from +/// the LiveMask (this is implemented for the entry block). +/// +/// (2) when entire regions (e.g. if-else blocks or entire loops) only +/// consist of exact and don't-care instructions, the switch only has to +/// be done at the entry and exit points rather than potentially in each +/// block of the region. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "si-wqm" + +namespace { + +enum { + StateWQM = 0x1, + StateExact = 0x2, +}; + +struct InstrInfo { + char Needs = 0; + char OutNeeds = 0; +}; + +struct BlockInfo { + char Needs = 0; + char InNeeds = 0; + char OutNeeds = 0; +}; + +struct WorkItem { + MachineBasicBlock *MBB = nullptr; + MachineInstr *MI = nullptr; + + WorkItem() {} + WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {} + WorkItem(MachineInstr *MI) : MI(MI) {} +}; + +class SIWholeQuadMode : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + DenseMap<const MachineInstr *, InstrInfo> Instructions; + DenseMap<MachineBasicBlock *, BlockInfo> Blocks; + SmallVector<const MachineInstr *, 2> ExecExports; + SmallVector<MachineInstr *, 1> LiveMaskQueries; + + char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist); + void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist); + void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist); + char analyzeFunction(MachineFunction &MF); + + void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg); + void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedWQM); + void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + + void lowerLiveMaskQueries(unsigned LiveMaskReg); + +public: + static char ID; + + SIWholeQuadMode() : + MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Whole Quad Mode"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SIWholeQuadMode::ID = 0; + +INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE, + "SI Whole Quad Mode", false, false) + +char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; + +FunctionPass *llvm::createSIWholeQuadModePass() { + return new SIWholeQuadMode; +} + +// Scan instructions to determine which ones require an Exact execmask and +// which ones seed WQM requirements. 
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF, + std::vector<WorkItem> &Worklist) { + char GlobalFlags = 0; + bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs"); + + for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = *BI; + + for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { + MachineInstr &MI = *II; + unsigned Opcode = MI.getOpcode(); + char Flags = 0; + + if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { + Flags = StateWQM; + } else if (MI.mayStore() && TII->usesVM_CNT(MI)) { + Flags = StateExact; + } else { + // Handle export instructions with the exec mask valid flag set + if (Opcode == AMDGPU::EXP) { + if (MI.getOperand(4).getImm() != 0) + ExecExports.push_back(&MI); + } else if (Opcode == AMDGPU::SI_PS_LIVE) { + LiveMaskQueries.push_back(&MI); + } else if (WQMOutputs) { + // The function is in machine SSA form, which means that physical + // VGPRs correspond to shader inputs and outputs. Inputs are + // only used, outputs are only defined. + for (const MachineOperand &MO : MI.defs()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + if (!TRI->isVirtualRegister(Reg) && + TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) { + Flags = StateWQM; + break; + } + } + } + + if (!Flags) + continue; + } + + Instructions[&MI].Needs = Flags; + Worklist.push_back(&MI); + GlobalFlags |= Flags; + } + + if (WQMOutputs && MBB.succ_empty()) { + // This is a prolog shader. Make sure we go back to exact mode at the end. + Blocks[&MBB].OutNeeds = StateExact; + Worklist.push_back(&MBB); + GlobalFlags |= StateExact; + } + } + + return GlobalFlags; +} + +void SIWholeQuadMode::propagateInstruction(MachineInstr &MI, + std::vector<WorkItem>& Worklist) { + MachineBasicBlock *MBB = MI.getParent(); + InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references + BlockInfo &BI = Blocks[MBB]; + + // Control flow-type instructions that are followed by WQM computations + // must themselves be in WQM. + if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) { + Instructions[&MI].Needs = StateWQM; + II.Needs = StateWQM; + } + + // Propagate to block level + BI.Needs |= II.Needs; + if ((BI.InNeeds | II.Needs) != BI.InNeeds) { + BI.InNeeds |= II.Needs; + Worklist.push_back(MBB); + } + + // Propagate backwards within block + if (MachineInstr *PrevMI = MI.getPrevNode()) { + char InNeeds = II.Needs | II.OutNeeds; + if (!PrevMI->isPHI()) { + InstrInfo &PrevII = Instructions[PrevMI]; + if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { + PrevII.OutNeeds |= InNeeds; + Worklist.push_back(PrevMI); + } + } + } + + // Propagate WQM flag to instruction inputs + assert(II.Needs != (StateWQM | StateExact)); + if (II.Needs != StateWQM) + return; + + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) + continue; + + // At this point, physical registers appear as inputs or outputs + // and following them makes no sense (and would in fact be incorrect + // when the same VGPR is used as both an output and an input that leads + // to a NeedsWQM instruction). + // + // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we + // have to trace this, in practice it happens for 64-bit computations like + // pointers where both dwords are followed already anyway. 
+ if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) + continue; + + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) { + InstrInfo &DefII = Instructions[&DefMI]; + + // Obviously skip if DefMI is already flagged as NeedWQM. + // + // The instruction might also be flagged as NeedExact. This happens when + // the result of an atomic is used in a WQM computation. In this case, + // the atomic must not run for helper pixels and the WQM result is + // undefined. + if (DefII.Needs != 0) + continue; + + DefII.Needs = StateWQM; + Worklist.push_back(&DefMI); + } + } +} + +void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB, + std::vector<WorkItem>& Worklist) { + BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references. + + // Propagate through instructions + if (!MBB.empty()) { + MachineInstr *LastMI = &*MBB.rbegin(); + InstrInfo &LastII = Instructions[LastMI]; + if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { + LastII.OutNeeds |= BI.OutNeeds; + Worklist.push_back(LastMI); + } + } + + // Predecessor blocks must provide for our WQM/Exact needs. + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds) + continue; + + PredBI.OutNeeds |= BI.InNeeds; + PredBI.InNeeds |= BI.InNeeds; + Worklist.push_back(Pred); + } + + // All successors must be prepared to accept the same set of WQM/Exact data. + for (MachineBasicBlock *Succ : MBB.successors()) { + BlockInfo &SuccBI = Blocks[Succ]; + if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds) + continue; + + SuccBI.InNeeds |= BI.OutNeeds; + Worklist.push_back(Succ); + } +} + +char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) { + std::vector<WorkItem> Worklist; + char GlobalFlags = scanInstructions(MF, Worklist); + + while (!Worklist.empty()) { + WorkItem WI = Worklist.back(); + Worklist.pop_back(); + + if (WI.MI) + propagateInstruction(*WI.MI, Worklist); + else + propagateBlock(*WI.MBB, Worklist); + } + + return GlobalFlags; +} + +void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg) { + if (SaveWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + SaveWQM) + .addReg(LiveMaskReg); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(LiveMaskReg); + } +} + +void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedWQM) { + if (SavedWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedWQM); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + } +} + +void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, + bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + const BlockInfo &BI = BII->second; + + if (!(BI.InNeeds & StateWQM)) + return; + + // This is a non-entry block that is WQM throughout, so no need to do + // anything. + if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + return; + + unsigned SavedWQMReg = 0; + bool WQMFromExec = isEntry; + char State = isEntry ? 
StateExact : StateWQM; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + ++II; + + // Skip instructions that are not affected by EXEC + if (TII->isScalarUnit(MI) && !MI.isTerminator()) + continue; + + // Generic instructions such as COPY will either disappear by register + // coalescing or be lowered to SALU or VALU instructions. + if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { + if (MI.getNumExplicitOperands() >= 1) { + const MachineOperand &Op = MI.getOperand(0); + if (Op.isReg()) { + if (TRI->isSGPRReg(*MRI, Op.getReg())) { + // SGPR instructions are not affected by EXEC + continue; + } + } + } + } + + char Needs = 0; + char OutNeeds = 0; + auto InstrInfoIt = Instructions.find(&MI); + if (InstrInfoIt != Instructions.end()) { + Needs = InstrInfoIt->second.Needs; + OutNeeds = InstrInfoIt->second.OutNeeds; + + // Make sure to switch to Exact mode before the end of the block when + // Exact and only Exact is needed further downstream. + if (OutNeeds == StateExact && MI.isTerminator()) { + assert(Needs == 0); + Needs = StateExact; + } + } + + // State switching + if (Needs && State != Needs) { + if (Needs == StateExact) { + assert(!SavedWQMReg); + + if (!WQMFromExec && (OutNeeds & StateWQM)) + SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + toExact(MBB, &MI, SavedWQMReg, LiveMaskReg); + } else { + assert(WQMFromExec == (SavedWQMReg == 0)); + toWQM(MBB, &MI, SavedWQMReg); + SavedWQMReg = 0; + } + + State = Needs; + } + } + + if ((BI.OutNeeds & StateWQM) && State != StateWQM) { + assert(WQMFromExec == (SavedWQMReg == 0)); + toWQM(MBB, MBB.end(), SavedWQMReg); + } else if (BI.OutNeeds == StateExact && State != StateExact) { + toExact(MBB, MBB.end(), 0, LiveMaskReg); + } +} + +void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + for (MachineInstr *MI : LiveMaskQueries) { + const DebugLoc &DL = MI->getDebugLoc(); + unsigned Dest = MI->getOperand(0).getReg(); + BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest) + .addReg(LiveMaskReg); + MI->eraseFromParent(); + } +} + +bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { + if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) + return false; + + Instructions.clear(); + Blocks.clear(); + ExecExports.clear(); + LiveMaskQueries.clear(); + + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + + TII = ST.getInstrInfo(); + TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + char GlobalFlags = analyzeFunction(MF); + if (!(GlobalFlags & StateWQM)) { + lowerLiveMaskQueries(AMDGPU::EXEC); + return !LiveMaskQueries.empty(); + } + + // Store a copy of the original live mask when required + unsigned LiveMaskReg = 0; + { + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); + + if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { + LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(AMDGPU::EXEC); + } + + if (GlobalFlags == StateWQM) { + // For a shader that needs only WQM, we can just set it once. 
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + + lowerLiveMaskQueries(LiveMaskReg); + // EntryMI may become invalid here + return true; + } + } + + lowerLiveMaskQueries(LiveMaskReg); + + // Handle the general case + for (auto BII : Blocks) + processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + + return true; +} diff --git a/lib/Target/AMDGPU/TargetInfo/Makefile b/lib/Target/AMDGPU/TargetInfo/Makefile deleted file mode 100644 index 1b232871bd62..000000000000 --- a/lib/Target/AMDGPU/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUInfo - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp new file mode 100644 index 000000000000..b6868de6a74e --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -0,0 +1,69 @@ +//===-- AMDGPUAsmUtils.cpp - AsmParser/InstPrinter common -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#include "AMDGPUAsmUtils.h" + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { + +// This must be in sync with llvm::AMDGPU::SendMsg::Id enum members, see SIDefines.h. +const char* const IdSymbolic[] = { + nullptr, + "MSG_INTERRUPT", + "MSG_GS", + "MSG_GS_DONE", + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "MSG_SYSMSG" +}; + +// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h. +const char* const OpSysSymbolic[] = { + nullptr, + "SYSMSG_OP_ECC_ERR_INTERRUPT", + "SYSMSG_OP_REG_RD", + "SYSMSG_OP_HOST_TRAP_ACK", + "SYSMSG_OP_TTRACE_PC" +}; + +const char* const OpGsSymbolic[] = { + "GS_OP_NOP", + "GS_OP_CUT", + "GS_OP_EMIT", + "GS_OP_EMIT_CUT" +}; + +} // namespace SendMsg + +namespace Hwreg { + +// This must be in sync with llvm::AMDGPU::Hwreg::ID_SYMBOLIC_FIRST_/LAST_, see SIDefines.h. +const char* const IdSymbolic[] = { + nullptr, + "HW_REG_MODE", + "HW_REG_STATUS", + "HW_REG_TRAPSTS", + "HW_REG_HW_ID", + "HW_REG_GPR_ALLOC", + "HW_REG_LDS_ALLOC", + "HW_REG_IB_STS" +}; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h new file mode 100644 index 000000000000..b2dc2c0e364c --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h @@ -0,0 +1,31 @@ +//===-- AMDGPUAsmUtils.h - AsmParser/InstPrinter common ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUASMUTILS_H + +namespace llvm { +namespace AMDGPU { +namespace SendMsg { // Symbolic names for the sendmsg(...) syntax. + +extern const char* const IdSymbolic[]; +extern const char* const OpSysSymbolic[]; +extern const char* const OpGsSymbolic[]; + +} // namespace SendMsg + +namespace Hwreg { // Symbolic names for the hwreg(...) syntax. + +extern const char* const IdSymbolic[]; + +} // namespace Hwreg +} // namespace AMDGPU +} // namespace llvm + +#endif diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 1f5deaef9d3b..c6f9142c0aa5 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -109,29 +109,45 @@ bool isReadOnlySegment(const GlobalValue *GV) { return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } -static unsigned getIntegerAttribute(const Function &F, const char *Name, - unsigned Default) { +int getIntegerAttribute(const Function &F, StringRef Name, int Default) { Attribute A = F.getFnAttribute(Name); - unsigned Result = Default; + int Result = Default; if (A.isStringAttribute()) { StringRef Str = A.getValueAsString(); if (Str.getAsInteger(0, Result)) { LLVMContext &Ctx = F.getContext(); - Ctx.emitError("can't parse shader type"); + Ctx.emitError("can't parse integer attribute " + Name); } } + return Result; } -unsigned getShaderType(const Function &F) { - return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); +unsigned getMaximumWorkGroupSize(const Function &F) { + return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256); } unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } +bool isShader(CallingConv::ID cc) { + switch(cc) { + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + +bool isCompute(CallingConv::ID cc) { + return !isShader(cc) || cc == CallingConv::AMDGPU_CS; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 57cbe1b58f98..995a9041fb36 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #include "AMDKernelCodeT.h" +#include "llvm/IR/CallingConv.h" namespace llvm { @@ -44,9 +45,13 @@ bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); -unsigned getShaderType(const Function &F); +int getIntegerAttribute(const Function &F, StringRef Name, int Default); + +unsigned getMaximumWorkGroupSize(const Function &F); unsigned getInitialPSInputAddr(const Function &F); +bool isShader(CallingConv::ID cc); +bool isCompute(CallingConv::ID cc); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h new file mode 100644 index 000000000000..3a5ff60601d0 --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -0,0 +1,165 @@ +//===--------------------- AMDKernelCodeTInfo.h 
---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file - specifies tables for amd_kernel_code_t structure parsing/printing +// +//===----------------------------------------------------------------------===// + +#define QNAME(name) amd_kernel_code_t::name +#define FLD_T(name) decltype(QNAME(name)), &QNAME(name) + +#define FIELD2(sname, name) \ + RECORD(sname, printField<FLD_T(name)>, parseField<FLD_T(name)>) + +#define FIELD(name) FIELD2(name, name) + + +#define PRINTCODEPROP(name) \ + printBitField<FLD_T(code_properties),\ + AMD_CODE_PROPERTY_##name##_SHIFT,\ + AMD_CODE_PROPERTY_##name##_WIDTH> + +#define PARSECODEPROP(name) \ + parseBitField<FLD_T(code_properties),\ + AMD_CODE_PROPERTY_##name##_SHIFT,\ + AMD_CODE_PROPERTY_##name##_WIDTH> + +#define CODEPROP(name, shift) \ + RECORD(name, PRINTCODEPROP(shift), PARSECODEPROP(shift)) + +// have to define these lambdas because of Set/GetMacro +#define PRINTCOMP(GetMacro, Shift) \ +[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \ + printName(OS, Name) << \ + (int)GetMacro(C.compute_pgm_resource_registers >> Shift); \ +} +#define PARSECOMP(SetMacro, Shift) \ +[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \ + int64_t Value = 0; \ + if (!expectAbsExpression(MCParser, Value, Err)) \ + return false; \ + C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \ + return true; \ +} + +#define COMPPGM(name, GetMacro, SetMacro, Shift) \ + RECORD(name, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift)) + +#define COMPPGM1(name, AccMacro) \ + COMPPGM(compute_pgm_rsrc1_##name, \ + G_00B848_##AccMacro, S_00B848_##AccMacro, 0) + +#define COMPPGM2(name, AccMacro) \ + COMPPGM(compute_pgm_rsrc2_##name, \ + G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32) + +/////////////////////////////////////////////////////////////////////////////// +// Begin of the table +// Define RECORD(name, print, parse) in your code to get field definitions +// and include this file + +FIELD2(kernel_code_version_major, amd_kernel_code_version_major), +FIELD2(kernel_code_version_minor, amd_kernel_code_version_minor), +FIELD2(machine_kind, amd_machine_kind), +FIELD2(machine_version_major, amd_machine_version_major), +FIELD2(machine_version_minor, amd_machine_version_minor), +FIELD2(machine_version_stepping, amd_machine_version_stepping), +FIELD(kernel_code_entry_byte_offset), +FIELD(kernel_code_prefetch_byte_size), +FIELD(max_scratch_backing_memory_byte_size), +FIELD(compute_pgm_resource_registers), +FIELD(workitem_private_segment_byte_size), +FIELD(workgroup_group_segment_byte_size), +FIELD(gds_segment_byte_size), +FIELD(kernarg_segment_byte_size), +FIELD(workgroup_fbarrier_count), +FIELD(wavefront_sgpr_count), +FIELD(workitem_vgpr_count), +FIELD(reserved_vgpr_first), +FIELD(reserved_vgpr_count), +FIELD(reserved_sgpr_first), +FIELD(reserved_sgpr_count), +FIELD(debug_wavefront_private_segment_offset_sgpr), +FIELD(debug_private_segment_buffer_sgpr), +FIELD(kernarg_segment_alignment), +FIELD(group_segment_alignment), +FIELD(private_segment_alignment), +FIELD(wavefront_size), +FIELD(call_convention), +FIELD(runtime_loader_kernel_symbol), + +COMPPGM1(vgprs, VGPRS), +COMPPGM1(sgprs, SGPRS), +COMPPGM1(priority, PRIORITY), 
+COMPPGM1(float_mode, FLOAT_MODE), +COMPPGM1(priv, PRIV), +COMPPGM1(dx10_clamp, DX10_CLAMP), +COMPPGM1(debug_mode, DEBUG_MODE), +COMPPGM1(ieee_mode, IEEE_MODE), +COMPPGM2(scratch_en, SCRATCH_EN), +COMPPGM2(user_sgpr, USER_SGPR), +COMPPGM2(tgid_x_en, TGID_X_EN), +COMPPGM2(tgid_y_en, TGID_Y_EN), +COMPPGM2(tgid_z_en, TGID_Z_EN), +COMPPGM2(tg_size_en, TG_SIZE_EN), +COMPPGM2(tidig_comp_cnt, TIDIG_COMP_CNT), +COMPPGM2(excp_en_msb, EXCP_EN_MSB), +COMPPGM2(lds_size, LDS_SIZE), +COMPPGM2(excp_en, EXCP_EN), + +CODEPROP(enable_sgpr_private_segment_buffer, + ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER), +CODEPROP(enable_sgpr_dispatch_ptr, + ENABLE_SGPR_DISPATCH_PTR), +CODEPROP(enable_sgpr_queue_ptr, + ENABLE_SGPR_QUEUE_PTR), +CODEPROP(enable_sgpr_kernarg_segment_ptr, + ENABLE_SGPR_KERNARG_SEGMENT_PTR), +CODEPROP(enable_sgpr_dispatch_id, + ENABLE_SGPR_DISPATCH_ID), +CODEPROP(enable_sgpr_flat_scratch_init, + ENABLE_SGPR_FLAT_SCRATCH_INIT), +CODEPROP(enable_sgpr_private_segment_size, + ENABLE_SGPR_PRIVATE_SEGMENT_SIZE), +CODEPROP(enable_sgpr_grid_workgroup_count_x, + ENABLE_SGPR_GRID_WORKGROUP_COUNT_X), +CODEPROP(enable_sgpr_grid_workgroup_count_y, + ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y), +CODEPROP(enable_sgpr_grid_workgroup_count_z, + ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z), +CODEPROP(enable_ordered_append_gds, + ENABLE_ORDERED_APPEND_GDS), +CODEPROP(private_element_size, + PRIVATE_ELEMENT_SIZE), +CODEPROP(is_ptr64, + IS_PTR64), +CODEPROP(is_dynamic_callstack, + IS_DYNAMIC_CALLSTACK), +CODEPROP(is_debug_enabled, + IS_DEBUG_SUPPORTED), +CODEPROP(is_xnack_enabled, + IS_XNACK_SUPPORTED) + +// end of the table +/////////////////////////////////////////////////////////////////////////////// + +#undef QNAME +#undef FLD_T +#undef FIELD2 +#undef FIELD +#undef PRINTCODEPROP +#undef PARSECODEPROP +#undef CODEPROP +#undef PRINTCOMP +#undef PARSECOMP +#undef COMPPGM +#undef COMPPGM1 +#undef COMPPGM2 diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp new file mode 100644 index 000000000000..f64973afa44f --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -0,0 +1,166 @@ +//===--------------------AMDKernelCodeTUtils.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +// +/// \file - utility functions to parse/print amd_kernel_code_t structure +// +//===----------------------------------------------------------------------===// + +#include "AMDKernelCodeTUtils.h" +#include "SIDefines.h" +#include <llvm/MC/MCParser/MCAsmLexer.h> +#include <llvm/MC/MCParser/MCAsmParser.h> +#include <llvm/Support/raw_ostream.h> + +using namespace llvm; + +static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() { + static StringRef const Table[] = { + "", // not found placeholder +#define RECORD(name, print, parse) #name +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +static StringMap<int> createIndexMap(const ArrayRef<StringRef> &a) { + StringMap<int> map; + for (auto Name : a) + map.insert(std::make_pair(Name, map.size())); + return map; +} + +static int get_amd_kernel_code_t_FieldIndex(StringRef name) { + static const auto map = createIndexMap(get_amd_kernel_code_t_FldNames()); + return map.lookup(name) - 1; // returns -1 if not found +} + +static StringRef get_amd_kernel_code_t_FieldName(int index) { + return get_amd_kernel_code_t_FldNames()[index + 1]; +} + + +// Field printing + +static raw_ostream &printName(raw_ostream &OS, StringRef Name) { + return OS << Name << " = "; +} + +template <typename T, T amd_kernel_code_t::*ptr> +static void printField(StringRef Name, const amd_kernel_code_t &C, + raw_ostream &OS) { + printName(OS, Name) << (int)(C.*ptr); +} + +template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1> +static void printBitField(StringRef Name, const amd_kernel_code_t &c, + raw_ostream &OS) { + const auto Mask = (static_cast<T>(1) << width) - 1; + printName(OS, Name) << (int)((c.*ptr >> shift) & Mask); +} + +typedef void(*PrintFx)(StringRef, + const amd_kernel_code_t &, + raw_ostream &); + +static ArrayRef<PrintFx> getPrinterTable() { + static const PrintFx Table[] = { +#define RECORD(name, print, parse) print +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C, + int FldIndex, + raw_ostream &OS) { + auto Printer = getPrinterTable()[FldIndex]; + if (Printer) + Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS); +} + +void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C, + raw_ostream &OS, + const char *tab) { + const int Size = getPrinterTable().size(); + for (int i = 0; i < Size; ++i) { + OS << tab; + printAmdKernelCodeField(*C, i, OS); + OS << '\n'; + } +} + + +// Field parsing + +static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) { + + if (MCParser.getLexer().isNot(AsmToken::Equal)) { + Err << "expected '='"; + return false; + } + MCParser.getLexer().Lex(); + + if (MCParser.parseAbsoluteExpression(Value)) { + Err << "integer absolute expression expected"; + return false; + } + return true; +} + +template <typename T, T amd_kernel_code_t::*ptr> +static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser, + raw_ostream &Err) { + int64_t Value = 0; + if (!expectAbsExpression(MCParser, Value, Err)) + return false; + C.*ptr = (T)Value; + return true; +} + +template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1> +static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser, + raw_ostream &Err) { + int64_t Value = 0; + if 
(!expectAbsExpression(MCParser, Value, Err)) + return false; + const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift; + C.*ptr &= (T)~Mask; + C.*ptr |= (T)((Value << shift) & Mask); + return true; +} + +typedef bool(*ParseFx)(amd_kernel_code_t &, + MCAsmParser &MCParser, + raw_ostream &Err); + +static ArrayRef<ParseFx> getParserTable() { + static const ParseFx Table[] = { +#define RECORD(name, print, parse) parse +#include "AMDKernelCodeTInfo.h" +#undef RECORD + }; + return makeArrayRef(Table); +} + +bool llvm::parseAmdKernelCodeField(StringRef ID, + MCAsmParser &MCParser, + amd_kernel_code_t &C, + raw_ostream &Err) { + const int Idx = get_amd_kernel_code_t_FieldIndex(ID); + if (Idx < 0) { + Err << "unexpected amd_kernel_code_t field name " << ID; + return false; + } + auto Parser = getParserTable()[Idx]; + return Parser ? Parser(C, MCParser, Err) : false; +} diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h new file mode 100644 index 000000000000..d9edca7a82ac --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h @@ -0,0 +1,39 @@ +//===- AMDGPUKernelCodeTUtils.h - helpers for amd_kernel_code_t *- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file AMDKernelCodeTUtils.h +//===----------------------------------------------------------------------===// + +#ifndef AMDKERNELCODETUTILS_H +#define AMDKERNELCODETUTILS_H + +#include "AMDKernelCodeT.h" + +namespace llvm { + +class MCAsmLexer; +class MCAsmParser; +class raw_ostream; +class StringRef; + +void printAmdKernelCodeField(const amd_kernel_code_t &C, + int FldIndex, + raw_ostream &OS); + +void dumpAmdKernelCode(const amd_kernel_code_t *C, + raw_ostream &OS, + const char *tab); + +bool parseAmdKernelCodeField(StringRef ID, + MCAsmParser &Parser, + amd_kernel_code_t &C, + raw_ostream &Err); + +} + +#endif // AMDKERNELCODETUTILS_H diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt index 2c07aeab7dd3..01b80ebe8d3d 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -1,3 +1,5 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp + AMDKernelCodeTUtils.cpp + AMDGPUAsmUtils.cpp ) diff --git a/lib/Target/AMDGPU/Utils/Makefile b/lib/Target/AMDGPU/Utils/Makefile deleted file mode 100644 index 1019e726d50e..000000000000 --- a/lib/Target/AMDGPU/Utils/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AMDGPU/Utils/Makefile --------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAMDGPUUtils - -# Hack: we need to include 'main' AMDGPU target directory to grab private -# headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AMDGPU/VIInstrFormats.td b/lib/Target/AMDGPU/VIInstrFormats.td index d8738f992630..912ed5329bfe 100644 --- a/lib/Target/AMDGPU/VIInstrFormats.td +++ b/lib/Target/AMDGPU/VIInstrFormats.td @@ -91,21 +91,28 @@ class MTBUFe_vi <bits<4> op> : Enc64 { class SMEMe_vi <bits<8> op, bit imm> : Enc64 { bits<7> sbase; - bits<7> sdata; + bits<7> sdst; bits<1> glc; - bits<20> offset; let Inst{5-0} = sbase{6-1}; - let Inst{12-6} = sdata; + let Inst{12-6} = sdst; let Inst{16} = glc; let Inst{17} = imm; let Inst{25-18} = op; let Inst{31-26} = 0x30; //encoding +} + +class SMEM_IMMe_vi <bits<8> op> : SMEMe_vi<op, 1> { + bits<20> offset; let Inst{51-32} = offset; } -class VOP3e_vi <bits<10> op> : Enc64 { - bits<8> vdst; +class SMEM_SOFFe_vi <bits<8> op> : SMEMe_vi<op, 0> { + bits<20> soff; + let Inst{51-32} = soff; +} + +class VOP3a_vi <bits<10> op> : Enc64 { bits<2> src0_modifiers; bits<9> src0; bits<2> src1_modifiers; @@ -115,7 +122,6 @@ class VOP3e_vi <bits<10> op> : Enc64 { bits<1> clamp; bits<2> omod; - let Inst{7-0} = vdst; let Inst{8} = src0_modifiers{1}; let Inst{9} = src1_modifiers{1}; let Inst{10} = src2_modifiers{1}; @@ -131,6 +137,20 @@ class VOP3e_vi <bits<10> op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP3e_vi <bits<10> op> : VOP3a_vi <op> { + bits<8> vdst; + + let Inst{7-0} = vdst; +} + +// Encoding used for VOPC instructions encoded as VOP3 +// Differs from VOP3e by destination name (sdst) as VOPC doesn't have vector dst +class VOP3ce_vi <bits<10> op> : VOP3a_vi <op> { + bits<8> sdst; + + let Inst{7-0} = sdst; +} + class VOP3be_vi <bits<10> op> : Enc64 { bits<8> vdst; bits<2> src0_modifiers; @@ -157,6 +177,117 @@ class VOP3be_vi <bits<10> op> : Enc64 { let Inst{63} = src2_modifiers{0}; } +class VOP_DPP <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> : + VOPAnyCommon <outs, ins, asm, pattern> { + let DPP = 1; + let Size = 8; + + let AsmMatchConverter = !if(!eq(HasMods,1), "cvtDPP", ""); +} + +class VOP_DPPe : Enc64 { + bits<2> src0_modifiers; + bits<8> src0; + bits<2> src1_modifiers; + bits<9> dpp_ctrl; + bits<1> bound_ctrl; + bits<4> bank_mask; + bits<4> row_mask; + + let Inst{39-32} = src0; + let Inst{48-40} = dpp_ctrl; + let Inst{51} = bound_ctrl; + let Inst{52} = src0_modifiers{0}; // src0_neg + let Inst{53} = src0_modifiers{1}; // src0_abs + let Inst{54} = src1_modifiers{0}; // src1_neg + let Inst{55} = src1_modifiers{1}; // src1_abs + let Inst{59-56} = bank_mask; + let Inst{63-60} = row_mask; +} + +class VOP1_DPPe <bits<8> op> : VOP_DPPe { + bits<8> vdst; + + let Inst{8-0} = 0xfa; // dpp + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; //encoding +} + +class VOP2_DPPe <bits<6> op> : VOP_DPPe { + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = 0xfa; //dpp + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding +} + +class VOP_SDWA <dag outs, dag ins, string asm, list<dag> pattern, bit HasMods = 0> : + VOPAnyCommon <outs, ins, asm, pattern> { + let SDWA = 1; + let Size = 8; +} + +class VOP_SDWAe : Enc64 { + bits<8> src0; + bits<3> src0_sel; + bits<2> src0_fmodifiers; // {abs,neg} + bits<1> src0_imodifiers; // sext + bits<3> src1_sel; + bits<2> src1_fmodifiers; + bits<1> src1_imodifiers; + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + + let Inst{39-32} = src0; + let Inst{42-40} = dst_sel; + let Inst{44-43} = dst_unused; + let Inst{45} = clamp; + let Inst{50-48} = src0_sel; + let Inst{53-52} = src0_fmodifiers; 
+ let Inst{51} = src0_imodifiers; + let Inst{58-56} = src1_sel; + let Inst{61-60} = src1_fmodifiers; + let Inst{59} = src1_imodifiers; +} + +class VOP1_SDWAe <bits<8> op> : VOP_SDWAe { + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = vdst; + let Inst{31-25} = 0x3f; // encoding +} + +class VOP2_SDWAe <bits<6> op> : VOP_SDWAe { + bits<8> vdst; + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = src1; + let Inst{24-17} = vdst; + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding +} + +class VOPC_SDWAe <bits<8> op> : VOP_SDWAe { + bits<8> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = src1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + + // VOPC disallows dst_sel and dst_unused as they have no effect on destination + let Inst{42-40} = 0x6; + let Inst{44-43} = 0x2; +} + class EXPe_vi : EXPe { let Inst{31-26} = 0x31; //encoding } diff --git a/lib/Target/AMDGPU/VIInstructions.td b/lib/Target/AMDGPU/VIInstructions.td index 1a7801c92bd7..5c490ab900f2 100644 --- a/lib/Target/AMDGPU/VIInstructions.td +++ b/lib/Target/AMDGPU/VIInstructions.td @@ -11,6 +11,8 @@ let SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI in { +let DisableSIDecoder = 1 in { + //===----------------------------------------------------------------------===// // VOP1 Instructions //===----------------------------------------------------------------------===// @@ -52,9 +54,9 @@ defm V_SUBREV_F16 : VOP2Inst <vop2<0, 0x21>, "v_subrev_f16", VOP_F16_F16_F16, defm V_MUL_F16 : VOP2Inst <vop2<0, 0x22>, "v_mul_f16", VOP_F16_F16_F16>; defm V_MAC_F16 : VOP2Inst <vop2<0, 0x23>, "v_mac_f16", VOP_F16_F16_F16>; } // End isCommutable = 1 -defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16">; +defm V_MADMK_F16 : VOP2MADK <vop2<0,0x24>, "v_madmk_f16", VOP_MADMK>; let isCommutable = 1 in { -defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16">; +defm V_MADAK_F16 : VOP2MADK <vop2<0,0x25>, "v_madak_f16", VOP_MADAK>; defm V_ADD_U16 : VOP2Inst <vop2<0,0x26>, "v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <vop2<0,0x27>, "v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <vop2<0,0x28>, "v_subrev_u16", VOP_I16_I16_I16>; @@ -73,6 +75,16 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; } // End isCommutable = 1 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// +let isCommutable = 1 in { + defm V_MAD_F16 : VOP3Inst <vop3<0, 0x1ea>, "v_mad_f16", VOP_F16_F16_F16_F16>; + defm V_MAD_U16 : VOP3Inst <vop3<0, 0x1eb>, "v_mad_u16", VOP_I16_I16_I16_I16>; + defm V_MAD_I16 : VOP3Inst <vop3<0, 0x1ec>, "v_mad_i16", VOP_I16_I16_I16_I16>; +} +} // let DisableSIDecoder = 1 + // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. 
@@ -99,6 +111,9 @@ def S_DCACHE_WB : SMEM_Inval <0x21, def S_DCACHE_WB_VOL : SMEM_Inval <0x23, "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +def S_MEMREALTIME : SMEM_Ret<0x25, + "s_memrealtime", int_amdgcn_s_memrealtime>; + } // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI let Predicates = [isVI] in { @@ -109,4 +124,35 @@ def : Pat < (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) >; +//===----------------------------------------------------------------------===// +// DPP Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask, + imm:$bound_ctrl), + (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask), + (as_i32imm $bank_mask), (as_i1imm $bound_ctrl)) +>; + +//===----------------------------------------------------------------------===// +// Misc Patterns +//===----------------------------------------------------------------------===// + +def : Pat < + (i64 (readcyclecounter)), + (S_MEMREALTIME) +>; + +//===----------------------------------------------------------------------===// +// DS_PERMUTE/DS_BPERMUTE Instructions. +//===----------------------------------------------------------------------===// + +let Uses = [EXEC] in { +defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE <0x3e, "ds_permute_b32", VGPR_32, + int_amdgcn_ds_permute>; +defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <0x3f, "ds_bpermute_b32", VGPR_32, + int_amdgcn_ds_bpermute>; +} + } // End Predicates = [isVI]
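The new amd_kernel_code_t tables above rely on an X-macro layout: AMDKernelCodeTInfo.h spells the field list out once, and AMDKernelCodeTUtils.cpp includes it several times with different RECORD definitions to build the name, printer, and parser tables in lockstep. A minimal self-contained sketch of that pattern follows; all names in it (DemoConfig, DEMO_FIELDS, FieldNames, Printers) are hypothetical and not part of LLVM.

#include <cstddef>
#include <iostream>

struct DemoConfig {
  unsigned wavefront_size;
  unsigned workitem_vgpr_count;
};

// The field list is written once; each consumer re-defines RECORD()
// and expands the list again, so the tables can never drift apart.
// (AMDKernelCodeTInfo.h plays this role and is re-included instead.)
#define DEMO_FIELDS \
  RECORD(wavefront_size) \
  RECORD(workitem_vgpr_count)

// Consumer 1: a table of field names.
static const char *const FieldNames[] = {
#define RECORD(name) #name,
  DEMO_FIELDS
#undef RECORD
};

// Consumer 2: a table of per-field printers built from the same list.
typedef void (*PrintFn)(const DemoConfig &, std::ostream &);
static const PrintFn Printers[] = {
#define RECORD(name) \
  [](const DemoConfig &C, std::ostream &OS) { OS << C.name; },
  DEMO_FIELDS
#undef RECORD
};

int main() {
  DemoConfig C{64, 24};
  for (std::size_t I = 0; I != sizeof(FieldNames) / sizeof(FieldNames[0]); ++I) {
    std::cout << FieldNames[I] << " = ";
    Printers[I](C, std::cout);
    std::cout << '\n';
  }
  return 0;
}

The tables in AMDKernelCodeTUtils.cpp have the same shape, with the parseField/parseBitField instantiations forming a third consumer for the assembler side, so adding a field to amd_kernel_code_t only requires one new entry in the shared list.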